Patchwork [2/2] vfio-powerpc: added VFIO support

login
register
mail settings
Submitter Alexey Kardashevskiy
Date July 10, 2012, 5:51 a.m.
Message ID <1341899497-23265-3-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/170049/
State New
Headers show

Comments

Alexey Kardashevskiy - July 10, 2012, 5:51 a.m.
The patch enables VFIO on POWER.

It literally does the following:

1. POWERPC IOMMU support (the kernel counterpart is required)

2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.

3. Added vfio_get_container_fd() to VFIO in order to initialize 1).

4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
distinguish VFIO's DMA context from the emulated one.

WIth the pathes posted today a bit earlier, this patch fully supports
VFIO what includes MSIX as well,


Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/Makefile.objs |    3 ++
 hw/spapr.h           |    4 +++
 hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/spapr_pci.c       |   23 ++++++++++++-
 hw/spapr_pci.h       |    2 ++
 hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
 hw/vfio_pci.h        |    2 ++
 7 files changed, 193 insertions(+), 4 deletions(-)
Alex Williamson - July 10, 2012, 4:55 p.m.
On Tue, 2012-07-10 at 15:51 +1000, Alexey Kardashevskiy wrote:
> The patch enables VFIO on POWER.
> 
> It literally does the following:
> 
> 1. POWERPC IOMMU support (the kernel counterpart is required)
> 
> 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.
> 
> 3. Added vfio_get_container_fd() to VFIO in order to initialize 1).
> 
> 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
> distinguish VFIO's DMA context from the emulated one.
> 
> WIth the pathes posted today a bit earlier, this patch fully supports
> VFIO what includes MSIX as well,
> 
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/Makefile.objs |    3 ++
>  hw/spapr.h           |    4 +++
>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/spapr_pci.c       |   23 ++++++++++++-
>  hw/spapr_pci.h       |    2 ++
>  hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
>  hw/vfio_pci.h        |    2 ++
>  7 files changed, 193 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f573a95..c46a049 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>  # Xilinx PPC peripherals
>  obj-y += xilinx_ethlite.o
>  
> +# VFIO PCI device assignment
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> +
>  obj-y := $(addprefix ../,$(obj-y))
> diff --git a/hw/spapr.h b/hw/spapr.h
> index b37f337..9dca704 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        DMAContext *dma);
>  
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size);
> +
>  #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 50c288d..0a194e8 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -16,6 +16,8 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>   */
> +#include <sys/ioctl.h>
> +
>  #include "hw.h"
>  #include "kvm.h"
>  #include "qdev.h"
> @@ -23,6 +25,7 @@
>  #include "dma.h"
>  
>  #include "hw/spapr.h"
> +#include "hw/linux-vfio.h"

I really need to move this into linux-headers.

>  
>  #include <libfdt.h>
>  
> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>      return 0;
>  }
>  
> +/* -------- API for POWERPC IOMMU -------- */
> +
> +#define POWERPC_IOMMU           2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +};
> +
> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;
> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)

I assume this would eventually go into the kernel vfio.h with a VFIO_
prefix.  Add a flags field to the structures or it'll be hard to extend
them later.

> +typedef struct sPAPRVFIOTable {
> +    int fd;
> +    uint32_t liobn;
> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> +} sPAPRVFIOTable;
> +
> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> +
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> +
> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
> +        return;
> +    }
> +    *dma32_window_start = info.dma32_window_start;
> +    *dma32_window_size = info.dma32_window_size;
> +
> +    t = g_malloc0(sizeof(*t));
> +    t->fd = fd;
> +    t->liobn = liobn;
> +
> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> +}
> +
> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_dma_map map = {
> +        .argsz = sizeof(map),
> +        .va = 0,
> +        .dmaaddr = ioba,
> +    };
> +
> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> +        if (t->liobn != liobn) {
> +            continue;
> +        }
> +        if (tce) {
> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        } else {
> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        }
> +        return H_SUCCESS;
> +    }
> +    return H_CONTINUE; /* positive non-zero value */
> +}
> +

I wish you could do this through a MemoryListener like we do on x86.

>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>      if (0 >= ret) {
>          return ret ? H_PARAMETER : H_SUCCESS;
>      }
> +    ret = put_tce_vfio(liobn, ioba, tce);
> +    if (0 >= ret) {
> +        return ret ? H_PARAMETER : H_SUCCESS;
> +    }
>  #ifdef DEBUG_TCE
>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 5f89003..3375c3f 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -29,6 +29,7 @@
>  #include "pci_host.h"
>  #include "hw/spapr.h"
>  #include "hw/spapr_pci.h"
> +#include "hw/vfio_pci.h"
>  #include "exec-memory.h"
>  #include <libfdt.h>
>  #include "trace.h"
> @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>                   level);
>  }
>  
> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> +{
> +    sPAPRPHBState *phb = opaque;
> +    return phb->lsi_table[irq_num].dt_irq;
> +}
> +
>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>                                unsigned size)
>  {
> @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                             phb->busname ? phb->busname : phb->dtbusname,
> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> +                           pci_spapr_map_irq, phb,
>                             &phb->memspace, &phb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>      phb->host_state.bus = bus;
> @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> +    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr,
>  /* Finalize PCI setup, called when all devices are already created */
>  int spapr_finalize_pci_setup(sPAPRPHBState *phb)
>  {
> +    if (phb->is_vfio) {
> +        int fd = vfio_get_container_fd(phb->host_state.bus);
> +
> +        if (fd < 0) {
> +            return fd;
> +        }
> +        spapr_vfio_init_dma(fd, phb->dma_liobn,
> +                            &phb->dma_window_start,
> +                            &phb->dma_window_size);
> +        return 0;
> +    }
> +
>      phb->dma_window_start = 0;
>      phb->dma_window_size = 0x40000000;
>      phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 3aae273..a4f031b 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState {
>          int nvec;
>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>  
> +    uint8_t is_vfio;
> +
>      QLIST_ENTRY(sPAPRPHBState) list;
>  } sPAPRPHBState;
>  
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 1ac287f..cc0b974 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -21,7 +21,6 @@
>  #include <dirent.h>
>  #include <stdio.h>
>  #include <unistd.h>
> -#include <sys/io.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
>  #include <sys/types.h>
> @@ -44,6 +43,17 @@
>  #include "vfio_pci.h"
>  #include "linux-vfio.h"
>  
> +#ifndef TARGET_PPC64
> +#include <sys/io.h>
> +#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
> +#else
> +#include "hw/pci_internals.h"
> +#include "hw/xics.h"
> +#include "hw/spapr.h"
> +#define POWERPC_IOMMU           2
> +#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
> +#endif
> +

VFIO_IOMMU_EXTENSION never gets used, POWER_IOMMU is redefined below.

>  //#define DEBUG_VFIO
>  #ifdef DEBUG_VFIO
>  #define DPRINTF(fmt, ...) \
> @@ -235,6 +245,7 @@ struct vfio_irq_set_fd {
>  
>  static void vfio_enable_intx_kvm(VFIODevice *vdev)
>  {
> +#ifndef TARGET_PPC64

Why do you need this, aren't the extension checks sufficient for this to
be a nop for you?

>  #ifdef CONFIG_KVM
>      struct vfio_irq_set_fd irq_set_fd = {
>  	.irq_set = {
> @@ -298,10 +309,12 @@ fail:
>      qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
>      vfio_unmask_intx(vdev);
>  #endif
> +#endif
>  }
>  
>  static void vfio_disable_intx_kvm(VFIODevice *vdev)
>  {
> +#ifndef TARGET_PPC64

Same

>  #ifdef CONFIG_KVM
>      struct vfio_irq_set_fd irq_set_fd = {
>  	.irq_set = {
> @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev)
>      DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
>              vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
>  #endif
> +#endif
>  }
>  
> +#ifndef TARGET_PPC64
>  static void vfio_update_irq(Notifier *notify, void *data)
>  {
>      VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
> @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data)
>      /* Re-enable the interrupt in cased we missed an EOI */
>      vfio_eoi(&vdev->intx.eoi, NULL);
>  }
> +#endif
>  
>  static int vfio_enable_intx(VFIODevice *vdev)
>  {
> @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev)
>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
>      vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
>      vdev->intx.eoi.notify = vfio_eoi;
> +#ifndef TARGET_PPC64
>      ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);

This is really only a place holder for x86 too, I don't think my eoi
notifier as written is acceptable upstream.  We really need some common
infrastructure here.  I'm hoping to get the kvm acceleration in place
which would make vfio usable on x86 with kvm (the common case), then
work towards a generic eoi notifier.

>  
>      vdev->intx.update_irq.notify = vfio_update_irq;
>      pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);

Can't you stub this out to make it safe to do on POWER too?

> +#else
> +    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> +#endif
>  
>      if (event_notifier_init(&vdev->intx.interrupt, 0)) {
>          error_report("vfio: Error: event_notifier_init failed\n");
> @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev)
>      vfio_disable_intx_kvm(vdev);
>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>  
> +#ifndef TARGET_PPC64
>      pci_remove_irq_update_notifier(&vdev->intx.update_irq);
>      ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> +#else
> +    xics_remove_eoi_notifier(&vdev->intx.eoi);
> +#endif
>  
>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
>      qemu_set_fd_handler(fd, NULL, NULL, vdev);
> @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>      }
>  
>      fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> -
> +#ifndef TARGET_PPC64
>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>      if (vdev->msi_vectors[vector].virq < 0 || 
>          kvm_irqchip_add_irqfd(kvm_state, fd,
> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>                              &vdev->msi_vectors[vector]);
>      }
> -
> +#else
> +    vdev->msi_vectors[vector].virq = -1;
> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> +                        &vdev->msi_vectors[vector]);
> +#endif

This shouldn't be necessary once the abort is removed from
kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
merges into qemu.

>      if (vdev->nr_vectors < vector + 1) {
>          int i;
>  
> @@ -692,6 +720,7 @@ retry:
>          fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>  
>          msg = msi_get_msg(&vdev->pdev, i);
> +#ifndef TARGET_PPC64
>          vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>          if (vdev->msi_vectors[i].virq < 0 || 
>              kvm_irqchip_add_irqfd(kvm_state, fd,
> @@ -699,6 +728,12 @@ retry:
>              qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>                                  &vdev->msi_vectors[i]);
>          }
> +#else
> +        vdev->msi_vectors[i].virq = -1;
> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> +                            &vdev->msi_vectors[i]);
> +        msg = msg;
> +#endif

Same here

>      }
>      
>      ret = vfio_enable_vectors(vdev, false);
> @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group)
>  
>          memory_listener_register(&container->listener, get_system_memory());
>  
> +#define POWERPC_IOMMU           2

Assume this will go in the kernel vfio.h at some point.  You may want to
pick a different name if there's a possibility of other powerpc iommu
implementations... thus the crappy type1 name for x86.

> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
>      } else {
>          error_report("vfio: No available IOMMU models\n");
>          g_free(container);
> @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void)
>  }
>  
>  type_init(register_vfio_pci_dev_type)
> +
> +int vfio_get_container_fd(struct PCIBus *pbus)
> +{
> +    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
> +    VFIODevice *vdev1st;
> +
> +    if (!kid1st) {
> +        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
> +               pbus->qbus.name);
> +        return -1;
> +    }
> +    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
> +
> +    return vdev1st->group->container->fd;
> +}
> +

This is not a generic implementation.  x86 won't have all devices on a
bus be vfio devices and even if it did, there's no guarantee they all
belong to the same container.  This should probably at least take a
PCIDevice and some kind of POWER specific code will need to know that
the container is the same for the whole bus.  Thanks,

Alex

> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> index 226607c..0d13341 100644
> --- a/hw/vfio_pci.h
> +++ b/hw/vfio_pci.h
> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>  
> +int vfio_get_container_fd(struct PCIBus *pbus);
> +
>  #endif /* __VFIO_H__ */
Benjamin Herrenschmidt - July 10, 2012, 9:32 p.m.
On Tue, 2012-07-10 at 10:55 -0600, Alex Williamson wrote:
> 
> I wish you could do this through a MemoryListener like we do on x86.
> 

Can you elaborate ? TCE (iommu) manipulation on PAPR is done via
specific hypervisor calls, not sure what a MemoryListener would do
here ...

Cheers,
Ben.
Alex Williamson - July 10, 2012, 9:48 p.m.
On Wed, 2012-07-11 at 07:32 +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2012-07-10 at 10:55 -0600, Alex Williamson wrote:
> > 
> > I wish you could do this through a MemoryListener like we do on x86.
> > 
> 
> Can you elaborate ? TCE (iommu) manipulation on PAPR is done via
> specific hypervisor calls, not sure what a MemoryListener would do
> here ...

Hmm, the guest directed iommu updates via hypercalls may not really fit
the MemoryListener model.  I'm just trying to think of ways to avoid
having an offshoot of vfio in the power code base by making use of
common abstraction layers.  If we got a region_add/del callback we could
potentially move the spapr map and unmap into vfio like we do for x86.
Thanks,

Alex
Benjamin Herrenschmidt - July 10, 2012, 9:53 p.m.
On Tue, 2012-07-10 at 15:48 -0600, Alex Williamson wrote:
> > specific hypervisor calls, not sure what a MemoryListener would do
> > here ...
> 
> Hmm, the guest directed iommu updates via hypercalls may not really fit
> the MemoryListener model.  I'm just trying to think of ways to avoid
> having an offshoot of vfio in the power code base by making use of
> common abstraction layers.  If we got a region_add/del callback we could
> potentially move the spapr map and unmap into vfio like we do for x86.
> Thanks, 

In the end we don't really want to use that anyway. map and unmap are
*extremely* performance sensitive in practice, so plan is to implement
the hypercall directly in the kernel KVM at a level where it won't even
go near generic code :-)

Basically, when the hypercall gets in, we take control in what we call
"real mode" on powerpc (MMU off, translation disabled), we have a window
to implement critical stuff like this before we context switch the MMU
to the host context (which on P7 is quite expensive).

This is where I want to go directly whack the TCE table as used by the
HW (provided the page has a good PTE of course), pretty much like we do
for populating the main MMU hash table.

So the map/unmap path will be entirely in arch specific code. The one
you see in Alexey code is basically only ever going to be used by
something like qemu full emulation...

Cheers,
Ben.
Scott Wood - July 10, 2012, 10:26 p.m.
On 07/10/2012 12:51 AM, Alexey Kardashevskiy wrote:
> The patch enables VFIO on POWER.
> 
> It literally does the following:
> 
> 1. POWERPC IOMMU support (the kernel counterpart is required)
[snip]
> +/* -------- API for POWERPC IOMMU -------- */
> +
> +#define POWERPC_IOMMU           2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +};
> +
> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)

Is there a more specific name that could be used for this?  Not all
PowerPC chips have the same kind of IOMMU.

-Scott
Alexey Kardashevskiy - July 10, 2012, 11:55 p.m.
On 11/07/12 08:26, Scott Wood wrote:
> On 07/10/2012 12:51 AM, Alexey Kardashevskiy wrote:
>> The patch enables VFIO on POWER.
>>
>> It literally does the following:
>>
>> 1. POWERPC IOMMU support (the kernel counterpart is required)
> [snip]
>> +/* -------- API for POWERPC IOMMU -------- */
>> +
>> +#define POWERPC_IOMMU           2
>> +
>> +struct tce_iommu_info {
>> +    __u32 argsz;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
>> +};
>> +
>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> 
> Is there a more specific name that could be used for this?  Not all
> PowerPC chips have the same kind of IOMMU.


Ben, is is SPAPR? BOOK3S?
Benjamin Herrenschmidt - July 11, 2012, 12:04 a.m.
On Wed, 2012-07-11 at 09:55 +1000, Alexey Kardashevskiy wrote:
> On 11/07/12 08:26, Scott Wood wrote:
> > On 07/10/2012 12:51 AM, Alexey Kardashevskiy wrote:
> >> The patch enables VFIO on POWER.
> >>
> >> It literally does the following:
> >>
> >> 1. POWERPC IOMMU support (the kernel counterpart is required)
> > [snip]
> >> +/* -------- API for POWERPC IOMMU -------- */
> >> +
> >> +#define POWERPC_IOMMU           2
> >> +
> >> +struct tce_iommu_info {
> >> +    __u32 argsz;
> >> +    __u32 dma32_window_start;
> >> +    __u32 dma32_window_size;
> >> +};
> >> +
> >> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> > 
> > Is there a more specific name that could be used for this?  Not all
> > PowerPC chips have the same kind of IOMMU.

> 
> Ben, is is SPAPR? BOOK3S?

So we have varieties of different iommus in the kernel indeed, we
probably want the info ioctl to reflect that. I would call this one
spapr_tce.

Also we will want a few other things, dunno if that's reflected here,
or whether the ioctl is easily extendable, but in the long run we will
need:

 - Ways to tell KVM about association between a liobn (logical bus
number as used in H_PUT_TCE) and an iommu so we can implement the real
mode H_PUT_TCE properly.

 - We will need some conduit to implement the "DDW" APIs (part of PAPR
allowing the guest to control the DMA windows, ie, by creating new
windows in 64-bit DMA space, with different page sizes etc....). So you
may want to make it clear that the above provides information about the
"base window" specifically.

Cheers,
Ben.
Alexey Kardashevskiy - July 11, 2012, 12:17 a.m.
On 11/07/12 10:04, Benjamin Herrenschmidt wrote:
> On Wed, 2012-07-11 at 09:55 +1000, Alexey Kardashevskiy wrote:
>> On 11/07/12 08:26, Scott Wood wrote:
>>> On 07/10/2012 12:51 AM, Alexey Kardashevskiy wrote:
>>>> The patch enables VFIO on POWER.
>>>>
>>>> It literally does the following:
>>>>
>>>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>> [snip]
>>>> +/* -------- API for POWERPC IOMMU -------- */
>>>> +
>>>> +#define POWERPC_IOMMU           2
>>>> +
>>>> +struct tce_iommu_info {
>>>> +    __u32 argsz;
>>>> +    __u32 dma32_window_start;
>>>> +    __u32 dma32_window_size;
>>>> +};
>>>> +
>>>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>
>>> Is there a more specific name that could be used for this?  Not all
>>> PowerPC chips have the same kind of IOMMU.
> 
>>
>> Ben, is is SPAPR? BOOK3S?
> 
> So we have varieties of different iommus in the kernel indeed, we
> probably want the info ioctl to reflect that. I would call this one
> spapr_tce.

VFIO provides such ioctl actually and it is it who returns POWERPC_IOMMU as an IOMMU type.

ok. SPAPR_TCE.


> Also we will want a few other things, dunno if that's reflected here,
> or whether the ioctl is easily extendable, but in the long run we will
> need:
> 
>  - Ways to tell KVM about association between a liobn (logical bus
> number as used in H_PUT_TCE) and an iommu so we can implement the real
> mode H_PUT_TCE properly.
> 
>  - We will need some conduit to implement the "DDW" APIs (part of PAPR
> allowing the guest to control the DMA windows, ie, by creating new
> windows in 64-bit DMA space, with different page sizes etc....). So you
> may want to make it clear that the above provides information about the
> "base window" specifically.

So the current one would be SPAPR_TCE_32?
Benjamin Herrenschmidt - July 11, 2012, 12:26 a.m.
On Wed, 2012-07-11 at 10:17 +1000, Alexey Kardashevskiy wrote:
> So the current one would be SPAPR_TCE_32?

No, the iommu type is SPAPR_TCE, but the *window* info you get here is
the 32-bit window. My thinking is add some versionning and a bunch of
reserved fields to that info struct so we can stick a few more things,
or at the very least add a flags field, so we can return if/when we
support DDW etc... on that iommu.

Cheers,
Ben.
Alexey Kardashevskiy - July 11, 2012, 2:54 a.m.
On 11/07/12 02:55, Alex Williamson wrote:
> On Tue, 2012-07-10 at 15:51 +1000, Alexey Kardashevskiy wrote:
>> The patch enables VFIO on POWER.
>>
>> It literally does the following:
>>
>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>
>> 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.
>>
>> 3. Added vfio_get_container_fd() to VFIO in order to initialize 1).
>>
>> 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
>> distinguish VFIO's DMA context from the emulated one.
>>
>> WIth the pathes posted today a bit earlier, this patch fully supports
>> VFIO what includes MSIX as well,
>>
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/ppc/Makefile.objs |    3 ++
>>  hw/spapr.h           |    4 +++
>>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/spapr_pci.c       |   23 ++++++++++++-
>>  hw/spapr_pci.h       |    2 ++
>>  hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
>>  hw/vfio_pci.h        |    2 ++
>>  7 files changed, 193 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index f573a95..c46a049 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>  # Xilinx PPC peripherals
>>  obj-y += xilinx_ethlite.o
>>  
>> +# VFIO PCI device assignment
>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>> +
>>  obj-y := $(addprefix ../,$(obj-y))
>> diff --git a/hw/spapr.h b/hw/spapr.h
>> index b37f337..9dca704 100644
>> --- a/hw/spapr.h
>> +++ b/hw/spapr.h
>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>                        DMAContext *dma);
>>  
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size);
>> +
>>  #endif /* !defined (__HW_SPAPR_H__) */
>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>> index 50c288d..0a194e8 100644
>> --- a/hw/spapr_iommu.c
>> +++ b/hw/spapr_iommu.c
>> @@ -16,6 +16,8 @@
>>   * You should have received a copy of the GNU Lesser General Public
>>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>>   */
>> +#include <sys/ioctl.h>
>> +
>>  #include "hw.h"
>>  #include "kvm.h"
>>  #include "qdev.h"
>> @@ -23,6 +25,7 @@
>>  #include "dma.h"
>>  
>>  #include "hw/spapr.h"
>> +#include "hw/linux-vfio.h"
> 
> I really need to move this into linux-headers.
> 
>>  
>>  #include <libfdt.h>
>>  
>> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>      return 0;
>>  }
>>  
>> +/* -------- API for POWERPC IOMMU -------- */
>> +
>> +#define POWERPC_IOMMU           2
>> +
>> +struct tce_iommu_info {
>> +    __u32 argsz;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
>> +};
>> +
>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct tce_iommu_dma_map {
>> +    __u32 argsz;
>> +    __u64 va;
>> +    __u64 dmaaddr;
>> +};
>> +
>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> 
> I assume this would eventually go into the kernel vfio.h with a VFIO_
> prefix.  Add a flags field to the structures or it'll be hard to extend
> them later.


We can always define another type of IOMMU :) But yes, I'll extend both map and info structures.



>> +typedef struct sPAPRVFIOTable {
>> +    int fd;
>> +    uint32_t liobn;
>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>> +} sPAPRVFIOTable;
>> +
>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>> +
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>> +
>> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
>> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
>> +        return;
>> +    }
>> +    *dma32_window_start = info.dma32_window_start;
>> +    *dma32_window_size = info.dma32_window_size;
>> +
>> +    t = g_malloc0(sizeof(*t));
>> +    t->fd = fd;
>> +    t->liobn = liobn;
>> +
>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>> +}
>> +
>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_dma_map map = {
>> +        .argsz = sizeof(map),
>> +        .va = 0,
>> +        .dmaaddr = ioba,
>> +    };
>> +
>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>> +        if (t->liobn != liobn) {
>> +            continue;
>> +        }
>> +        if (tce) {
>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        } else {
>> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        }
>> +        return H_SUCCESS;
>> +    }
>> +    return H_CONTINUE; /* positive non-zero value */
>> +}
>> +
> 
> I wish you could do this through a MemoryListener like we do on x86.


What is the point? Map the entire RAM to the guest? And It will still use our own IOMMU ioctls as it
is completely our IOMMU implementaiton.


>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>                                target_ulong opcode, target_ulong *args)
>>  {
>> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>      if (0 >= ret) {
>>          return ret ? H_PARAMETER : H_SUCCESS;
>>      }
>> +    ret = put_tce_vfio(liobn, ioba, tce);
>> +    if (0 >= ret) {
>> +        return ret ? H_PARAMETER : H_SUCCESS;
>> +    }
>>  #ifdef DEBUG_TCE
>>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>> index 5f89003..3375c3f 100644
>> --- a/hw/spapr_pci.c
>> +++ b/hw/spapr_pci.c
>> @@ -29,6 +29,7 @@
>>  #include "pci_host.h"
>>  #include "hw/spapr.h"
>>  #include "hw/spapr_pci.h"
>> +#include "hw/vfio_pci.h"
>>  #include "exec-memory.h"
>>  #include <libfdt.h>
>>  #include "trace.h"
>> @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>                   level);
>>  }
>>  
>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>> +{
>> +    sPAPRPHBState *phb = opaque;
>> +    return phb->lsi_table[irq_num].dt_irq;
>> +}
>> +
>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>                                unsigned size)
>>  {
>> @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s)
>>  
>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>                             phb->busname ? phb->busname : phb->dtbusname,
>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>> +                           pci_spapr_map_irq, phb,
>>                             &phb->memspace, &phb->iospace,
>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>      phb->host_state.bus = bus;
>> @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>> +    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr,
>>  /* Finalize PCI setup, called when all devices are already created */
>>  int spapr_finalize_pci_setup(sPAPRPHBState *phb)
>>  {
>> +    if (phb->is_vfio) {
>> +        int fd = vfio_get_container_fd(phb->host_state.bus);
>> +
>> +        if (fd < 0) {
>> +            return fd;
>> +        }
>> +        spapr_vfio_init_dma(fd, phb->dma_liobn,
>> +                            &phb->dma_window_start,
>> +                            &phb->dma_window_size);
>> +        return 0;
>> +    }
>> +
>>      phb->dma_window_start = 0;
>>      phb->dma_window_size = 0x40000000;
>>      phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>> index 3aae273..a4f031b 100644
>> --- a/hw/spapr_pci.h
>> +++ b/hw/spapr_pci.h
>> @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState {
>>          int nvec;
>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>  
>> +    uint8_t is_vfio;
>> +
>>      QLIST_ENTRY(sPAPRPHBState) list;
>>  } sPAPRPHBState;
>>  
>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>> index 1ac287f..cc0b974 100644
>> --- a/hw/vfio_pci.c
>> +++ b/hw/vfio_pci.c
>> @@ -21,7 +21,6 @@
>>  #include <dirent.h>
>>  #include <stdio.h>
>>  #include <unistd.h>
>> -#include <sys/io.h>
>>  #include <sys/ioctl.h>
>>  #include <sys/mman.h>
>>  #include <sys/types.h>
>> @@ -44,6 +43,17 @@
>>  #include "vfio_pci.h"
>>  #include "linux-vfio.h"
>>  
>> +#ifndef TARGET_PPC64
>> +#include <sys/io.h>
>> +#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
>> +#else
>> +#include "hw/pci_internals.h"
>> +#include "hw/xics.h"
>> +#include "hw/spapr.h"
>> +#define POWERPC_IOMMU           2
>> +#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
>> +#endif
>> +
> 
> VFIO_IOMMU_EXTENSION never gets used, POWER_IOMMU is redefined below.


Yes, a bit messy. Was not sure about the name so I postponed it.


>>  //#define DEBUG_VFIO
>>  #ifdef DEBUG_VFIO
>>  #define DPRINTF(fmt, ...) \
>> @@ -235,6 +245,7 @@ struct vfio_irq_set_fd {
>>  
>>  static void vfio_enable_intx_kvm(VFIODevice *vdev)
>>  {
>> +#ifndef TARGET_PPC64
> 
> Why do you need this, aren't the extension checks sufficient for this to
> be a nop for you?


It uses ioapic_remove_gsi_eoi_notifier() so it needs some #ifdef anyway. And as we do not support
kvm_irqchip_in_kernel(), there is no point in fixing it and I disabled it all.
When we make eoi notifiers a platform independent, then yes, it will be nop.


>>  #ifdef CONFIG_KVM
>>      struct vfio_irq_set_fd irq_set_fd = {
>>  	.irq_set = {
>> @@ -298,10 +309,12 @@ fail:
>>      qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
>>      vfio_unmask_intx(vdev);
>>  #endif
>> +#endif
>>  }
>>  
>>  static void vfio_disable_intx_kvm(VFIODevice *vdev)
>>  {
>> +#ifndef TARGET_PPC64
> 
> Same

Same :)

> 
>>  #ifdef CONFIG_KVM
>>      struct vfio_irq_set_fd irq_set_fd = {
>>  	.irq_set = {
>> @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev)
>>      DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
>>              vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
>>  #endif
>> +#endif
>>  }
>>  
>> +#ifndef TARGET_PPC64
>>  static void vfio_update_irq(Notifier *notify, void *data)
>>  {
>>      VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
>> @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data)
>>      /* Re-enable the interrupt in cased we missed an EOI */
>>      vfio_eoi(&vdev->intx.eoi, NULL);
>>  }
>> +#endif
>>  
>>  static int vfio_enable_intx(VFIODevice *vdev)
>>  {
>> @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev)
>>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
>>      vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
>>      vdev->intx.eoi.notify = vfio_eoi;
>> +#ifndef TARGET_PPC64
>>      ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> 
> This is really only a place holder for x86 too, I don't think my eoi
> notifier as written is acceptable upstream.  We really need some common
> infrastructure here.  I'm hoping to get the kvm acceleration in place
> which would make vfio usable on x86 with kvm (the common case), then
> work towards a generic eoi notifier.
> 
>>  
>>      vdev->intx.update_irq.notify = vfio_update_irq;
>>      pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);
> 
> Can't you stub this out to make it safe to do on POWER too?


I could even simply enable it (not sure if it is going to be called ever though but anyway) once we
get unified eoi notifiers.


>> +#else
>> +    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>> +#endif
>>  
>>      if (event_notifier_init(&vdev->intx.interrupt, 0)) {
>>          error_report("vfio: Error: event_notifier_init failed\n");
>> @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev)
>>      vfio_disable_intx_kvm(vdev);
>>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>>  
>> +#ifndef TARGET_PPC64
>>      pci_remove_irq_update_notifier(&vdev->intx.update_irq);
>>      ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>> +#else
>> +    xics_remove_eoi_notifier(&vdev->intx.eoi);
>> +#endif
>>  
>>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
>>      qemu_set_fd_handler(fd, NULL, NULL, vdev);
>> @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>>      }
>>  
>>      fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
>> -
>> +#ifndef TARGET_PPC64
>>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>      if (vdev->msi_vectors[vector].virq < 0 || 
>>          kvm_irqchip_add_irqfd(kvm_state, fd,
>> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>                              &vdev->msi_vectors[vector]);
>>      }
>> -
>> +#else
>> +    vdev->msi_vectors[vector].virq = -1;
>> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>> +                        &vdev->msi_vectors[vector]);
>> +#endif
> 
> This shouldn't be necessary once the abort is removed from
> kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
> merges into qemu.


True, I just did not pick up your very last changes. Updating is always painful, and now it is even
worse then usual as pci_get_irq has been renamed to something else :) Will do though.



>>      if (vdev->nr_vectors < vector + 1) {
>>          int i;
>>  
>> @@ -692,6 +720,7 @@ retry:
>>          fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>>  
>>          msg = msi_get_msg(&vdev->pdev, i);
>> +#ifndef TARGET_PPC64
>>          vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>          if (vdev->msi_vectors[i].virq < 0 || 
>>              kvm_irqchip_add_irqfd(kvm_state, fd,
>> @@ -699,6 +728,12 @@ retry:
>>              qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>                                  &vdev->msi_vectors[i]);
>>          }
>> +#else
>> +        vdev->msi_vectors[i].virq = -1;
>> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>> +                            &vdev->msi_vectors[i]);
>> +        msg = msg;
>> +#endif
> 
> Same here
> 
>>      }
>>      
>>      ret = vfio_enable_vectors(vdev, false);
>> @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group)
>>  
>>          memory_listener_register(&container->listener, get_system_memory());
>>  
>> +#define POWERPC_IOMMU           2
> 
> Assume this will go in the kernel vfio.h at some point.  You may want to
> pick a different name if there's a possibility of other powerpc iommu
> implementations... thus the crappy type1 name for x86.
> 
>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> +        if (ret) {
>> +            error_report("vfio: failed to set group container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>> +
>> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
>> +        if (ret) {
>> +            error_report("vfio: failed to set iommu for container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>>      } else {
>>          error_report("vfio: No available IOMMU models\n");
>>          g_free(container);
>> @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void)
>>  }
>>  
>>  type_init(register_vfio_pci_dev_type)
>> +
>> +int vfio_get_container_fd(struct PCIBus *pbus)
>> +{
>> +    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
>> +    VFIODevice *vdev1st;
>> +
>> +    if (!kid1st) {
>> +        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
>> +               pbus->qbus.name);
>> +        return -1;
>> +    }
>> +    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
>> +
>> +    return vdev1st->group->container->fd;
>> +}
>> +
> 
> This is not a generic implementation.  x86 won't have all devices on a
> bus be vfio devices and even if it did, there's no guarantee they all
> belong to the same container.  This should probably at least take a
> PCIDevice and some kind of POWER specific code will need to know that
> the container is the same for the whole bus.  Thanks,


This is a workaround, true. x86 does not need this call at all. And on powerpc VFIO devices won't
share PCI bus with emulated devices. I just need some API to get this fd.

Well I probably can add MemoryListener for the DMA window and move all power-specific map/unmap code
to VFIO but it does not look much better. I would rather prefer separating IOMMU code from vfio_pci
somehow (more or less as it is now for powerpc). While doing it, we could think of the API to get
this fd which we need anyway in order to setup the DMA window which is per group (which QEMU does
not understand) but not per device.



> Alex
> 
>> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
>> index 226607c..0d13341 100644
>> --- a/hw/vfio_pci.h
>> +++ b/hw/vfio_pci.h
>> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>>  
>> +int vfio_get_container_fd(struct PCIBus *pbus);
>> +
>>  #endif /* __VFIO_H__ */
> 
> 
>
Benjamin Herrenschmidt - July 11, 2012, 3:10 a.m.
On Wed, 2012-07-11 at 12:54 +1000, Alexey Kardashevskiy wrote:
> > Why do you need this, aren't the extension checks sufficient for this to
> > be a nop for you?
> 
> 
> It uses ioapic_remove_gsi_eoi_notifier() so it needs some #ifdef anyway. And as we do not support
> kvm_irqchip_in_kernel(), there is no point in fixing it and I disabled it all.
> When we make eoi notifiers a platform independent, then yes, it will be nop.

In fact we have an internal experimental patch to move our
PIC emulation into the kernel but so far I have not managed
to make it fit in the existing irqchip stuff.

That irqchip interface is nasty. It's completely x86 centric,
and have tendrils all over the place, into the msi code, into
devices (virtio-pci.c) etc... in ways that are essentially unusable for
anything that looks a bit different.

IE. In urgent need of refactoring.

> >> -
> >> +#ifndef TARGET_PPC64
> >>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> >>      if (vdev->msi_vectors[vector].virq < 0 || 
> >>          kvm_irqchip_add_irqfd(kvm_state, fd,
> >> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
> >>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> >>                              &vdev->msi_vectors[vector]);
> >>      }
> >> -
> >> +#else
> >> +    vdev->msi_vectors[vector].virq = -1;
> >> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> >> +                        &vdev->msi_vectors[vector]);
> >> +#endif
> > 
> > This shouldn't be necessary once the abort is removed from
> > kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
> > merges into qemu.

It must also return an irq number, what to chose in that case ? Ie. the
whole irqchip API is a trainwreck if you ask me :-) Very poorly thought
out.

> True, I just did not pick up your very last changes. Updating is always painful, and now it is even
> worse then usual as pci_get_irq has been renamed to something else :) Will do though.

 .../...

> Well I probably can add MemoryListener for the DMA window and move all power-specific map/unmap code
> to VFIO but it does not look much better. I would rather prefer separating IOMMU code from vfio_pci
> somehow (more or less as it is now for powerpc). While doing it, we could think of the API to get
> this fd which we need anyway in order to setup the DMA window which is per group (which QEMU does
> not understand) but not per device.

Right. What we really want is our own private iommu interface based on
the iommu type. Each iommu will have it's own "quirks" in that regard
anyways.

Cheers,
Ben.

> 
> 
> > Alex
> > 
> >> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> >> index 226607c..0d13341 100644
> >> --- a/hw/vfio_pci.h
> >> +++ b/hw/vfio_pci.h
> >> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
> >>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
> >>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
> >>  
> >> +int vfio_get_container_fd(struct PCIBus *pbus);
> >> +
> >>  #endif /* __VFIO_H__ */
> > 
> > 
> > 
> 
> 
> -- 
> Alexey
> 
> 
>
Alex Williamson - July 12, 2012, 3:11 a.m.
On Wed, 2012-07-11 at 12:54 +1000, Alexey Kardashevskiy wrote:
> On 11/07/12 02:55, Alex Williamson wrote:
> > On Tue, 2012-07-10 at 15:51 +1000, Alexey Kardashevskiy wrote:
> >> The patch enables VFIO on POWER.
> >>
> >> It literally does the following:
> >>
> >> 1. POWERPC IOMMU support (the kernel counterpart is required)
> >>
> >> 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.
> >>
> >> 3. Added vfio_get_container_fd() to VFIO in order to initialize 1).
> >>
> >> 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
> >> distinguish VFIO's DMA context from the emulated one.
> >>
> >> WIth the pathes posted today a bit earlier, this patch fully supports
> >> VFIO what includes MSIX as well,
> >>
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  hw/ppc/Makefile.objs |    3 ++
> >>  hw/spapr.h           |    4 +++
> >>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  hw/spapr_pci.c       |   23 ++++++++++++-
> >>  hw/spapr_pci.h       |    2 ++
> >>  hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
> >>  hw/vfio_pci.h        |    2 ++
> >>  7 files changed, 193 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >> index f573a95..c46a049 100644
> >> --- a/hw/ppc/Makefile.objs
> >> +++ b/hw/ppc/Makefile.objs
> >> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
> >>  # Xilinx PPC peripherals
> >>  obj-y += xilinx_ethlite.o
> >>  
> >> +# VFIO PCI device assignment
> >> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> >> +
> >>  obj-y := $(addprefix ../,$(obj-y))
> >> diff --git a/hw/spapr.h b/hw/spapr.h
> >> index b37f337..9dca704 100644
> >> --- a/hw/spapr.h
> >> +++ b/hw/spapr.h
> >> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> >>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
> >>                        DMAContext *dma);
> >>  
> >> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> >> +                         uint64_t *dma32_window_start,
> >> +                         uint64_t *dma32_window_size);
> >> +
> >>  #endif /* !defined (__HW_SPAPR_H__) */
> >> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> >> index 50c288d..0a194e8 100644
> >> --- a/hw/spapr_iommu.c
> >> +++ b/hw/spapr_iommu.c
> >> @@ -16,6 +16,8 @@
> >>   * You should have received a copy of the GNU Lesser General Public
> >>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> >>   */
> >> +#include <sys/ioctl.h>
> >> +
> >>  #include "hw.h"
> >>  #include "kvm.h"
> >>  #include "qdev.h"
> >> @@ -23,6 +25,7 @@
> >>  #include "dma.h"
> >>  
> >>  #include "hw/spapr.h"
> >> +#include "hw/linux-vfio.h"
> > 
> > I really need to move this into linux-headers.
> > 
> >>  
> >>  #include <libfdt.h>
> >>  
> >> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
> >>      return 0;
> >>  }
> >>  
> >> +/* -------- API for POWERPC IOMMU -------- */
> >> +
> >> +#define POWERPC_IOMMU           2
> >> +
> >> +struct tce_iommu_info {
> >> +    __u32 argsz;
> >> +    __u32 dma32_window_start;
> >> +    __u32 dma32_window_size;
> >> +};
> >> +
> >> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> >> +
> >> +struct tce_iommu_dma_map {
> >> +    __u32 argsz;
> >> +    __u64 va;
> >> +    __u64 dmaaddr;
> >> +};
> >> +
> >> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> >> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> > 
> > I assume this would eventually go into the kernel vfio.h with a VFIO_
> > prefix.  Add a flags field to the structures or it'll be hard to extend
> > them later.
> 
> 
> We can always define another type of IOMMU :) But yes, I'll extend both map and info structures.
> 
> 
> 
> >> +typedef struct sPAPRVFIOTable {
> >> +    int fd;
> >> +    uint32_t liobn;
> >> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> >> +} sPAPRVFIOTable;
> >> +
> >> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> >> +
> >> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> >> +                         uint64_t *dma32_window_start,
> >> +                         uint64_t *dma32_window_size)
> >> +{
> >> +    sPAPRVFIOTable *t;
> >> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> >> +
> >> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
> >> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
> >> +        return;
> >> +    }
> >> +    *dma32_window_start = info.dma32_window_start;
> >> +    *dma32_window_size = info.dma32_window_size;
> >> +
> >> +    t = g_malloc0(sizeof(*t));
> >> +    t->fd = fd;
> >> +    t->liobn = liobn;
> >> +
> >> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> >> +}
> >> +
> >> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> >> +{
> >> +    sPAPRVFIOTable *t;
> >> +    struct tce_iommu_dma_map map = {
> >> +        .argsz = sizeof(map),
> >> +        .va = 0,
> >> +        .dmaaddr = ioba,
> >> +    };
> >> +
> >> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> >> +        if (t->liobn != liobn) {
> >> +            continue;
> >> +        }
> >> +        if (tce) {
> >> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> >> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
> >> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
> >> +                return H_PARAMETER;
> >> +            }
> >> +        } else {
> >> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
> >> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
> >> +                return H_PARAMETER;
> >> +            }
> >> +        }
> >> +        return H_SUCCESS;
> >> +    }
> >> +    return H_CONTINUE; /* positive non-zero value */
> >> +}
> >> +
> > 
> > I wish you could do this through a MemoryListener like we do on x86.
> 
> 
> What is the point? Map the entire RAM to the guest? And It will still use our own IOMMU ioctls as it
> is completely our IOMMU implementaiton.

Yeah, with Ben's explanation it's probably not worth the effort.  We
might want to consider putting stuff like this in logical vfio-arch
files though (vfio-spapr, vfio-x86, vfio-x86-kvm, etc).

> >>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> >>                                target_ulong opcode, target_ulong *args)
> >>  {
> >> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> >>      if (0 >= ret) {
> >>          return ret ? H_PARAMETER : H_SUCCESS;
> >>      }
> >> +    ret = put_tce_vfio(liobn, ioba, tce);
> >> +    if (0 >= ret) {
> >> +        return ret ? H_PARAMETER : H_SUCCESS;
> >> +    }
> >>  #ifdef DEBUG_TCE
> >>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
> >>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
> >> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> >> index 5f89003..3375c3f 100644
> >> --- a/hw/spapr_pci.c
> >> +++ b/hw/spapr_pci.c
> >> @@ -29,6 +29,7 @@
> >>  #include "pci_host.h"
> >>  #include "hw/spapr.h"
> >>  #include "hw/spapr_pci.h"
> >> +#include "hw/vfio_pci.h"
> >>  #include "exec-memory.h"
> >>  #include <libfdt.h>
> >>  #include "trace.h"
> >> @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
> >>                   level);
> >>  }
> >>  
> >> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> >> +{
> >> +    sPAPRPHBState *phb = opaque;
> >> +    return phb->lsi_table[irq_num].dt_irq;
> >> +}
> >> +
> >>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
> >>                                unsigned size)
> >>  {
> >> @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s)
> >>  
> >>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
> >>                             phb->busname ? phb->busname : phb->dtbusname,
> >> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> >> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> >> +                           pci_spapr_map_irq, phb,
> >>                             &phb->memspace, &phb->iospace,
> >>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
> >>      phb->host_state.bus = bus;
> >> @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = {
> >>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
> >>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
> >>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> >> +    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>  
> >> @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr,
> >>  /* Finalize PCI setup, called when all devices are already created */
> >>  int spapr_finalize_pci_setup(sPAPRPHBState *phb)
> >>  {
> >> +    if (phb->is_vfio) {
> >> +        int fd = vfio_get_container_fd(phb->host_state.bus);
> >> +
> >> +        if (fd < 0) {
> >> +            return fd;
> >> +        }
> >> +        spapr_vfio_init_dma(fd, phb->dma_liobn,
> >> +                            &phb->dma_window_start,
> >> +                            &phb->dma_window_size);
> >> +        return 0;
> >> +    }
> >> +
> >>      phb->dma_window_start = 0;
> >>      phb->dma_window_size = 0x40000000;
> >>      phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> >> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> >> index 3aae273..a4f031b 100644
> >> --- a/hw/spapr_pci.h
> >> +++ b/hw/spapr_pci.h
> >> @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState {
> >>          int nvec;
> >>      } msi_table[SPAPR_MSIX_MAX_DEVS];
> >>  
> >> +    uint8_t is_vfio;
> >> +
> >>      QLIST_ENTRY(sPAPRPHBState) list;
> >>  } sPAPRPHBState;
> >>  
> >> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> >> index 1ac287f..cc0b974 100644
> >> --- a/hw/vfio_pci.c
> >> +++ b/hw/vfio_pci.c
> >> @@ -21,7 +21,6 @@
> >>  #include <dirent.h>
> >>  #include <stdio.h>
> >>  #include <unistd.h>
> >> -#include <sys/io.h>
> >>  #include <sys/ioctl.h>
> >>  #include <sys/mman.h>
> >>  #include <sys/types.h>
> >> @@ -44,6 +43,17 @@
> >>  #include "vfio_pci.h"
> >>  #include "linux-vfio.h"
> >>  
> >> +#ifndef TARGET_PPC64
> >> +#include <sys/io.h>
> >> +#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
> >> +#else
> >> +#include "hw/pci_internals.h"
> >> +#include "hw/xics.h"
> >> +#include "hw/spapr.h"
> >> +#define POWERPC_IOMMU           2
> >> +#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
> >> +#endif
> >> +
> > 
> > VFIO_IOMMU_EXTENSION never gets used, POWER_IOMMU is redefined below.
> 
> 
> Yes, a bit messy. Was not sure about the name so I postponed it.
> 
> 
> >>  //#define DEBUG_VFIO
> >>  #ifdef DEBUG_VFIO
> >>  #define DPRINTF(fmt, ...) \
> >> @@ -235,6 +245,7 @@ struct vfio_irq_set_fd {
> >>  
> >>  static void vfio_enable_intx_kvm(VFIODevice *vdev)
> >>  {
> >> +#ifndef TARGET_PPC64
> > 
> > Why do you need this, aren't the extension checks sufficient for this to
> > be a nop for you?
> 
> 
> It uses ioapic_remove_gsi_eoi_notifier() so it needs some #ifdef anyway. And as we do not support
> kvm_irqchip_in_kernel(), there is no point in fixing it and I disabled it all.
> When we make eoi notifiers a platform independent, then yes, it will be nop.

Ah right, forgot you won't even build ioapic_*.

> >>  #ifdef CONFIG_KVM
> >>      struct vfio_irq_set_fd irq_set_fd = {
> >>  	.irq_set = {
> >> @@ -298,10 +309,12 @@ fail:
> >>      qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
> >>      vfio_unmask_intx(vdev);
> >>  #endif
> >> +#endif
> >>  }
> >>  
> >>  static void vfio_disable_intx_kvm(VFIODevice *vdev)
> >>  {
> >> +#ifndef TARGET_PPC64
> > 
> > Same
> 
> Same :)
> 
> > 
> >>  #ifdef CONFIG_KVM
> >>      struct vfio_irq_set_fd irq_set_fd = {
> >>  	.irq_set = {
> >> @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev)
> >>      DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
> >>              vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
> >>  #endif
> >> +#endif
> >>  }
> >>  
> >> +#ifndef TARGET_PPC64
> >>  static void vfio_update_irq(Notifier *notify, void *data)
> >>  {
> >>      VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
> >> @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data)
> >>      /* Re-enable the interrupt in cased we missed an EOI */
> >>      vfio_eoi(&vdev->intx.eoi, NULL);
> >>  }
> >> +#endif
> >>  
> >>  static int vfio_enable_intx(VFIODevice *vdev)
> >>  {
> >> @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev)
> >>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
> >>      vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
> >>      vdev->intx.eoi.notify = vfio_eoi;
> >> +#ifndef TARGET_PPC64
> >>      ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> > 
> > This is really only a place holder for x86 too, I don't think my eoi
> > notifier as written is acceptable upstream.  We really need some common
> > infrastructure here.  I'm hoping to get the kvm acceleration in place
> > which would make vfio usable on x86 with kvm (the common case), then
> > work towards a generic eoi notifier.
> > 
> >>  
> >>      vdev->intx.update_irq.notify = vfio_update_irq;
> >>      pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);
> > 
> > Can't you stub this out to make it safe to do on POWER too?
> 
> 
> I could even simply enable it (not sure if it is going to be called ever though but anyway) once we
> get unified eoi notifiers.

Right

> >> +#else
> >> +    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> >> +#endif
> >>  
> >>      if (event_notifier_init(&vdev->intx.interrupt, 0)) {
> >>          error_report("vfio: Error: event_notifier_init failed\n");
> >> @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev)
> >>      vfio_disable_intx_kvm(vdev);
> >>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
> >>  
> >> +#ifndef TARGET_PPC64
> >>      pci_remove_irq_update_notifier(&vdev->intx.update_irq);
> >>      ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
> >> +#else
> >> +    xics_remove_eoi_notifier(&vdev->intx.eoi);
> >> +#endif
> >>  
> >>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
> >>      qemu_set_fd_handler(fd, NULL, NULL, vdev);
> >> @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
> >>      }
> >>  
> >>      fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> >> -
> >> +#ifndef TARGET_PPC64
> >>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> >>      if (vdev->msi_vectors[vector].virq < 0 || 
> >>          kvm_irqchip_add_irqfd(kvm_state, fd,
> >> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
> >>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> >>                              &vdev->msi_vectors[vector]);
> >>      }
> >> -
> >> +#else
> >> +    vdev->msi_vectors[vector].virq = -1;
> >> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> >> +                        &vdev->msi_vectors[vector]);
> >> +#endif
> > 
> > This shouldn't be necessary once the abort is removed from
> > kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
> > merges into qemu.
> 
> 
> True, I just did not pick up your very last changes. Updating is always painful, and now it is even
> worse then usual as pci_get_irq has been renamed to something else :) Will do though.

Yep, I think once Michael is back from holiday and does a pull request
(and hopefully merges Jan's PCIBus irq routing patches) my tree will be
down to mostly just the vfio driver and I'll start managing it like the
kernel tree with a patch series that gets rebased.  I'm hoping that if I
can get an acceptable level irqfd/eoifd implementation for x86 kvm in
the kernel that I can rip out the ioapic eoi notifiers and submit the
code as functional only with kvm and work out the generic eoi notifiers
in qemu proper.

> >>      if (vdev->nr_vectors < vector + 1) {
> >>          int i;
> >>  
> >> @@ -692,6 +720,7 @@ retry:
> >>          fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> >>  
> >>          msg = msi_get_msg(&vdev->pdev, i);
> >> +#ifndef TARGET_PPC64
> >>          vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> >>          if (vdev->msi_vectors[i].virq < 0 || 
> >>              kvm_irqchip_add_irqfd(kvm_state, fd,
> >> @@ -699,6 +728,12 @@ retry:
> >>              qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> >>                                  &vdev->msi_vectors[i]);
> >>          }
> >> +#else
> >> +        vdev->msi_vectors[i].virq = -1;
> >> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> >> +                            &vdev->msi_vectors[i]);
> >> +        msg = msg;
> >> +#endif
> > 
> > Same here
> > 
> >>      }
> >>      
> >>      ret = vfio_enable_vectors(vdev, false);
> >> @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group)
> >>  
> >>          memory_listener_register(&container->listener, get_system_memory());
> >>  
> >> +#define POWERPC_IOMMU           2
> > 
> > Assume this will go in the kernel vfio.h at some point.  You may want to
> > pick a different name if there's a possibility of other powerpc iommu
> > implementations... thus the crappy type1 name for x86.
> > 
> >> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
> >> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> >> +        if (ret) {
> >> +            error_report("vfio: failed to set group container: %s\n",
> >> +                         strerror(errno));
> >> +            g_free(container);
> >> +            close(fd);
> >> +            return -1;
> >> +        }
> >> +
> >> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
> >> +        if (ret) {
> >> +            error_report("vfio: failed to set iommu for container: %s\n",
> >> +                         strerror(errno));
> >> +            g_free(container);
> >> +            close(fd);
> >> +            return -1;
> >> +        }
> >>      } else {
> >>          error_report("vfio: No available IOMMU models\n");
> >>          g_free(container);
> >> @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void)
> >>  }
> >>  
> >>  type_init(register_vfio_pci_dev_type)
> >> +
> >> +int vfio_get_container_fd(struct PCIBus *pbus)
> >> +{
> >> +    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
> >> +    VFIODevice *vdev1st;
> >> +
> >> +    if (!kid1st) {
> >> +        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
> >> +               pbus->qbus.name);
> >> +        return -1;
> >> +    }
> >> +    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
> >> +
> >> +    return vdev1st->group->container->fd;
> >> +}
> >> +
> > 
> > This is not a generic implementation.  x86 won't have all devices on a
> > bus be vfio devices and even if it did, there's no guarantee they all
> > belong to the same container.  This should probably at least take a
> > PCIDevice and some kind of POWER specific code will need to know that
> > the container is the same for the whole bus.  Thanks,
> 
> 
> This is a workaround, true. x86 does not need this call at all. And on powerpc VFIO devices won't
> share PCI bus with emulated devices. I just need some API to get this fd.
> 
> Well I probably can add MemoryListener for the DMA window and move all power-specific map/unmap code
> to VFIO but it does not look much better. I would rather prefer separating IOMMU code from vfio_pci
> somehow (more or less as it is now for powerpc). While doing it, we could think of the API to get
> this fd which we need anyway in order to setup the DMA window which is per group (which QEMU does
> not understand) but not per device.

The MemoryListener probably doesn't make sense with a guest driven iova
window.  It would be an abuse of the interface I think.  At some level
in the power code you, or at least the user, needs to know about groups
though.  That's how you end up with an emulated bridge in front of each
group, right?  So with that same knowledge, shouldn't the API simply be:

int vfio_get_container_fd(PCIDevice *dev)

where power code picks a device from the bus since you know they're all
in the same group?  Thanks,

Alex
Alexey Kardashevskiy - July 12, 2012, 8:47 a.m.
On 12/07/12 13:11, Alex Williamson wrote:
> On Wed, 2012-07-11 at 12:54 +1000, Alexey Kardashevskiy wrote:
>> On 11/07/12 02:55, Alex Williamson wrote:
>>> On Tue, 2012-07-10 at 15:51 +1000, Alexey Kardashevskiy wrote:
>>>> The patch enables VFIO on POWER.
>>>>
>>>> It literally does the following:
>>>>
>>>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>>>
>>>> 2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.
>>>>
>>>> 3. Added vfio_get_container_fd() to VFIO in order to initialize 1).
>>>>
>>>> 4. Makefile fixed and "is_vfio" flag added into sPAPR PHB - required to
>>>> distinguish VFIO's DMA context from the emulated one.
>>>>
>>>> WIth the pathes posted today a bit earlier, this patch fully supports
>>>> VFIO what includes MSIX as well,
>>>>
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>  hw/ppc/Makefile.objs |    3 ++
>>>>  hw/spapr.h           |    4 +++
>>>>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  hw/spapr_pci.c       |   23 ++++++++++++-
>>>>  hw/spapr_pci.h       |    2 ++
>>>>  hw/vfio_pci.c        |   76 +++++++++++++++++++++++++++++++++++++++++--
>>>>  hw/vfio_pci.h        |    2 ++
>>>>  7 files changed, 193 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>>> index f573a95..c46a049 100644
>>>> --- a/hw/ppc/Makefile.objs
>>>> +++ b/hw/ppc/Makefile.objs
>>>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>>>  # Xilinx PPC peripherals
>>>>  obj-y += xilinx_ethlite.o
>>>>  
>>>> +# VFIO PCI device assignment
>>>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>>>> +
>>>>  obj-y := $(addprefix ../,$(obj-y))
>>>> diff --git a/hw/spapr.h b/hw/spapr.h
>>>> index b37f337..9dca704 100644
>>>> --- a/hw/spapr.h
>>>> +++ b/hw/spapr.h
>>>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>>>                        DMAContext *dma);
>>>>  
>>>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>>>> +                         uint64_t *dma32_window_start,
>>>> +                         uint64_t *dma32_window_size);
>>>> +
>>>>  #endif /* !defined (__HW_SPAPR_H__) */
>>>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>>>> index 50c288d..0a194e8 100644
>>>> --- a/hw/spapr_iommu.c
>>>> +++ b/hw/spapr_iommu.c
>>>> @@ -16,6 +16,8 @@
>>>>   * You should have received a copy of the GNU Lesser General Public
>>>>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>>>>   */
>>>> +#include <sys/ioctl.h>
>>>> +
>>>>  #include "hw.h"
>>>>  #include "kvm.h"
>>>>  #include "qdev.h"
>>>> @@ -23,6 +25,7 @@
>>>>  #include "dma.h"
>>>>  
>>>>  #include "hw/spapr.h"
>>>> +#include "hw/linux-vfio.h"
>>>
>>> I really need to move this into linux-headers.
>>>
>>>>  
>>>>  #include <libfdt.h>
>>>>  
>>>> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>>>      return 0;
>>>>  }
>>>>  
>>>> +/* -------- API for POWERPC IOMMU -------- */
>>>> +
>>>> +#define POWERPC_IOMMU           2
>>>> +
>>>> +struct tce_iommu_info {
>>>> +    __u32 argsz;
>>>> +    __u32 dma32_window_start;
>>>> +    __u32 dma32_window_size;
>>>> +};
>>>> +
>>>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>> +
>>>> +struct tce_iommu_dma_map {
>>>> +    __u32 argsz;
>>>> +    __u64 va;
>>>> +    __u64 dmaaddr;
>>>> +};
>>>> +
>>>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>>>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>>
>>> I assume this would eventually go into the kernel vfio.h with a VFIO_
>>> prefix.  Add a flags field to the structures or it'll be hard to extend
>>> them later.
>>
>>
>> We can always define another type of IOMMU :) But yes, I'll extend both map and info structures.
>>
>>
>>
>>>> +typedef struct sPAPRVFIOTable {
>>>> +    int fd;
>>>> +    uint32_t liobn;
>>>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>>>> +} sPAPRVFIOTable;
>>>> +
>>>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>>>> +
>>>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>>>> +                         uint64_t *dma32_window_start,
>>>> +                         uint64_t *dma32_window_size)
>>>> +{
>>>> +    sPAPRVFIOTable *t;
>>>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>>>> +
>>>> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
>>>> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
>>>> +        return;
>>>> +    }
>>>> +    *dma32_window_start = info.dma32_window_start;
>>>> +    *dma32_window_size = info.dma32_window_size;
>>>> +
>>>> +    t = g_malloc0(sizeof(*t));
>>>> +    t->fd = fd;
>>>> +    t->liobn = liobn;
>>>> +
>>>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>>>> +}
>>>> +
>>>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>>>> +{
>>>> +    sPAPRVFIOTable *t;
>>>> +    struct tce_iommu_dma_map map = {
>>>> +        .argsz = sizeof(map),
>>>> +        .va = 0,
>>>> +        .dmaaddr = ioba,
>>>> +    };
>>>> +
>>>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>>>> +        if (t->liobn != liobn) {
>>>> +            continue;
>>>> +        }
>>>> +        if (tce) {
>>>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>>>> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
>>>> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
>>>> +                return H_PARAMETER;
>>>> +            }
>>>> +        } else {
>>>> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
>>>> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
>>>> +                return H_PARAMETER;
>>>> +            }
>>>> +        }
>>>> +        return H_SUCCESS;
>>>> +    }
>>>> +    return H_CONTINUE; /* positive non-zero value */
>>>> +}
>>>> +
>>>
>>> I wish you could do this through a MemoryListener like we do on x86.
>>
>>
>> What is the point? Map the entire RAM to the guest? And It will still use our own IOMMU ioctls as it
>> is completely our IOMMU implementaiton.
> 
> Yeah, with Ben's explanation it's probably not worth the effort.  We
> might want to consider putting stuff like this in logical vfio-arch
> files though (vfio-spapr, vfio-x86, vfio-x86-kvm, etc).
> 
>>>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>>>                                target_ulong opcode, target_ulong *args)
>>>>  {
>>>> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>>>      if (0 >= ret) {
>>>>          return ret ? H_PARAMETER : H_SUCCESS;
>>>>      }
>>>> +    ret = put_tce_vfio(liobn, ioba, tce);
>>>> +    if (0 >= ret) {
>>>> +        return ret ? H_PARAMETER : H_SUCCESS;
>>>> +    }
>>>>  #ifdef DEBUG_TCE
>>>>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>>>>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
>>>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>>>> index 5f89003..3375c3f 100644
>>>> --- a/hw/spapr_pci.c
>>>> +++ b/hw/spapr_pci.c
>>>> @@ -29,6 +29,7 @@
>>>>  #include "pci_host.h"
>>>>  #include "hw/spapr.h"
>>>>  #include "hw/spapr_pci.h"
>>>> +#include "hw/vfio_pci.h"
>>>>  #include "exec-memory.h"
>>>>  #include <libfdt.h>
>>>>  #include "trace.h"
>>>> @@ -440,6 +441,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>>>                   level);
>>>>  }
>>>>  
>>>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>>>> +{
>>>> +    sPAPRPHBState *phb = opaque;
>>>> +    return phb->lsi_table[irq_num].dt_irq;
>>>> +}
>>>> +
>>>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>>>                                unsigned size)
>>>>  {
>>>> @@ -567,7 +574,8 @@ static int spapr_phb_init(SysBusDevice *s)
>>>>  
>>>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>>>                             phb->busname ? phb->busname : phb->dtbusname,
>>>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>>>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>>>> +                           pci_spapr_map_irq, phb,
>>>>                             &phb->memspace, &phb->iospace,
>>>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>>>      phb->host_state.bus = bus;
>>>> @@ -596,6 +604,7 @@ static Property spapr_phb_properties[] = {
>>>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>>>> +    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
>>>>      DEFINE_PROP_END_OF_LIST(),
>>>>  };
>>>>  
>>>> @@ -639,6 +648,18 @@ void spapr_create_phb(sPAPREnvironment *spapr,
>>>>  /* Finalize PCI setup, called when all devices are already created */
>>>>  int spapr_finalize_pci_setup(sPAPRPHBState *phb)
>>>>  {
>>>> +    if (phb->is_vfio) {
>>>> +        int fd = vfio_get_container_fd(phb->host_state.bus);
>>>> +
>>>> +        if (fd < 0) {
>>>> +            return fd;
>>>> +        }
>>>> +        spapr_vfio_init_dma(fd, phb->dma_liobn,
>>>> +                            &phb->dma_window_start,
>>>> +                            &phb->dma_window_size);
>>>> +        return 0;
>>>> +    }
>>>> +
>>>>      phb->dma_window_start = 0;
>>>>      phb->dma_window_size = 0x40000000;
>>>>      phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>>>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>>>> index 3aae273..a4f031b 100644
>>>> --- a/hw/spapr_pci.h
>>>> +++ b/hw/spapr_pci.h
>>>> @@ -57,6 +57,8 @@ typedef struct sPAPRPHBState {
>>>>          int nvec;
>>>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>>>  
>>>> +    uint8_t is_vfio;
>>>> +
>>>>      QLIST_ENTRY(sPAPRPHBState) list;
>>>>  } sPAPRPHBState;
>>>>  
>>>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>>>> index 1ac287f..cc0b974 100644
>>>> --- a/hw/vfio_pci.c
>>>> +++ b/hw/vfio_pci.c
>>>> @@ -21,7 +21,6 @@
>>>>  #include <dirent.h>
>>>>  #include <stdio.h>
>>>>  #include <unistd.h>
>>>> -#include <sys/io.h>
>>>>  #include <sys/ioctl.h>
>>>>  #include <sys/mman.h>
>>>>  #include <sys/types.h>
>>>> @@ -44,6 +43,17 @@
>>>>  #include "vfio_pci.h"
>>>>  #include "linux-vfio.h"
>>>>  
>>>> +#ifndef TARGET_PPC64
>>>> +#include <sys/io.h>
>>>> +#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
>>>> +#else
>>>> +#include "hw/pci_internals.h"
>>>> +#include "hw/xics.h"
>>>> +#include "hw/spapr.h"
>>>> +#define POWERPC_IOMMU           2
>>>> +#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
>>>> +#endif
>>>> +
>>>
>>> VFIO_IOMMU_EXTENSION never gets used, POWER_IOMMU is redefined below.
>>
>>
>> Yes, a bit messy. Was not sure about the name so I postponed it.
>>
>>
>>>>  //#define DEBUG_VFIO
>>>>  #ifdef DEBUG_VFIO
>>>>  #define DPRINTF(fmt, ...) \
>>>> @@ -235,6 +245,7 @@ struct vfio_irq_set_fd {
>>>>  
>>>>  static void vfio_enable_intx_kvm(VFIODevice *vdev)
>>>>  {
>>>> +#ifndef TARGET_PPC64
>>>
>>> Why do you need this, aren't the extension checks sufficient for this to
>>> be a nop for you?
>>
>>
>> It uses ioapic_remove_gsi_eoi_notifier() so it needs some #ifdef anyway. And as we do not support
>> kvm_irqchip_in_kernel(), there is no point in fixing it and I disabled it all.
>> When we make eoi notifiers a platform independent, then yes, it will be nop.
> 
> Ah right, forgot you won't even build ioapic_*.
> 
>>>>  #ifdef CONFIG_KVM
>>>>      struct vfio_irq_set_fd irq_set_fd = {
>>>>  	.irq_set = {
>>>> @@ -298,10 +309,12 @@ fail:
>>>>      qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
>>>>      vfio_unmask_intx(vdev);
>>>>  #endif
>>>> +#endif
>>>>  }
>>>>  
>>>>  static void vfio_disable_intx_kvm(VFIODevice *vdev)
>>>>  {
>>>> +#ifndef TARGET_PPC64
>>>
>>> Same
>>
>> Same :)
>>
>>>
>>>>  #ifdef CONFIG_KVM
>>>>      struct vfio_irq_set_fd irq_set_fd = {
>>>>  	.irq_set = {
>>>> @@ -350,8 +363,10 @@ static void vfio_disable_intx_kvm(VFIODevice *vdev)
>>>>      DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
>>>>              vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
>>>>  #endif
>>>> +#endif
>>>>  }
>>>>  
>>>> +#ifndef TARGET_PPC64
>>>>  static void vfio_update_irq(Notifier *notify, void *data)
>>>>  {
>>>>      VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
>>>> @@ -381,6 +396,7 @@ static void vfio_update_irq(Notifier *notify, void *data)
>>>>      /* Re-enable the interrupt in cased we missed an EOI */
>>>>      vfio_eoi(&vdev->intx.eoi, NULL);
>>>>  }
>>>> +#endif
>>>>  
>>>>  static int vfio_enable_intx(VFIODevice *vdev)
>>>>  {
>>>> @@ -404,10 +420,14 @@ static int vfio_enable_intx(VFIODevice *vdev)
>>>>      vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
>>>>      vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
>>>>      vdev->intx.eoi.notify = vfio_eoi;
>>>> +#ifndef TARGET_PPC64
>>>>      ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>>>
>>> This is really only a place holder for x86 too, I don't think my eoi
>>> notifier as written is acceptable upstream.  We really need some common
>>> infrastructure here.  I'm hoping to get the kvm acceleration in place
>>> which would make vfio usable on x86 with kvm (the common case), then
>>> work towards a generic eoi notifier.
>>>
>>>>  
>>>>      vdev->intx.update_irq.notify = vfio_update_irq;
>>>>      pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);
>>>
>>> Can't you stub this out to make it safe to do on POWER too?
>>
>>
>> I could even simply enable it (not sure if it is going to be called ever though but anyway) once we
>> get unified eoi notifiers.
> 
> Right
> 
>>>> +#else
>>>> +    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>>>> +#endif
>>>>  
>>>>      if (event_notifier_init(&vdev->intx.interrupt, 0)) {
>>>>          error_report("vfio: Error: event_notifier_init failed\n");
>>>> @@ -440,8 +460,12 @@ static void vfio_disable_intx(VFIODevice *vdev)
>>>>      vfio_disable_intx_kvm(vdev);
>>>>      vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
>>>>  
>>>> +#ifndef TARGET_PPC64
>>>>      pci_remove_irq_update_notifier(&vdev->intx.update_irq);
>>>>      ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
>>>> +#else
>>>> +    xics_remove_eoi_notifier(&vdev->intx.eoi);
>>>> +#endif
>>>>  
>>>>      fd = event_notifier_get_fd(&vdev->intx.interrupt);
>>>>      qemu_set_fd_handler(fd, NULL, NULL, vdev);
>>>> @@ -543,7 +567,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>>>>      }
>>>>  
>>>>      fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
>>>> -
>>>> +#ifndef TARGET_PPC64
>>>>      vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>>      if (vdev->msi_vectors[vector].virq < 0 || 
>>>>          kvm_irqchip_add_irqfd(kvm_state, fd,
>>>> @@ -551,7 +575,11 @@ static int vfio_msix_vector_use(PCIDevice *pdev,
>>>>          qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>>>                              &vdev->msi_vectors[vector]);
>>>>      }
>>>> -
>>>> +#else
>>>> +    vdev->msi_vectors[vector].virq = -1;
>>>> +    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>>> +                        &vdev->msi_vectors[vector]);
>>>> +#endif
>>>
>>> This shouldn't be necessary once the abort is removed from
>>> kvm_irqchip_add_msi_route.  It'll be merged next time the kvm uq tree
>>> merges into qemu.
>>
>>
>> True, I just did not pick up your very last changes. Updating is always painful, and now it is even
>> worse then usual as pci_get_irq has been renamed to something else :) Will do though.
> 
> Yep, I think once Michael is back from holiday and does a pull request
> (and hopefully merges Jan's PCIBus irq routing patches) my tree will be
> down to mostly just the vfio driver and I'll start managing it like the
> kernel tree with a patch series that gets rebased.  I'm hoping that if I
> can get an acceptable level irqfd/eoifd implementation for x86 kvm in
> the kernel that I can rip out the ioapic eoi notifiers and submit the
> code as functional only with kvm and work out the generic eoi notifiers
> in qemu proper.
> 
>>>>      if (vdev->nr_vectors < vector + 1) {
>>>>          int i;
>>>>  
>>>> @@ -692,6 +720,7 @@ retry:
>>>>          fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
>>>>  
>>>>          msg = msi_get_msg(&vdev->pdev, i);
>>>> +#ifndef TARGET_PPC64
>>>>          vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
>>>>          if (vdev->msi_vectors[i].virq < 0 || 
>>>>              kvm_irqchip_add_irqfd(kvm_state, fd,
>>>> @@ -699,6 +728,12 @@ retry:
>>>>              qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>>>                                  &vdev->msi_vectors[i]);
>>>>          }
>>>> +#else
>>>> +        vdev->msi_vectors[i].virq = -1;
>>>> +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
>>>> +                            &vdev->msi_vectors[i]);
>>>> +        msg = msg;
>>>> +#endif
>>>
>>> Same here
>>>
>>>>      }
>>>>      
>>>>      ret = vfio_enable_vectors(vdev, false);
>>>> @@ -1581,6 +1616,25 @@ static int vfio_connect_container(VFIOGroup *group)
>>>>  
>>>>          memory_listener_register(&container->listener, get_system_memory());
>>>>  
>>>> +#define POWERPC_IOMMU           2
>>>
>>> Assume this will go in the kernel vfio.h at some point.  You may want to
>>> pick a different name if there's a possibility of other powerpc iommu
>>> implementations... thus the crappy type1 name for x86.
>>>
>>>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
>>>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>>>> +        if (ret) {
>>>> +            error_report("vfio: failed to set group container: %s\n",
>>>> +                         strerror(errno));
>>>> +            g_free(container);
>>>> +            close(fd);
>>>> +            return -1;
>>>> +        }
>>>> +
>>>> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
>>>> +        if (ret) {
>>>> +            error_report("vfio: failed to set iommu for container: %s\n",
>>>> +                         strerror(errno));
>>>> +            g_free(container);
>>>> +            close(fd);
>>>> +            return -1;
>>>> +        }
>>>>      } else {
>>>>          error_report("vfio: No available IOMMU models\n");
>>>>          g_free(container);
>>>> @@ -2005,3 +2059,19 @@ static void register_vfio_pci_dev_type(void)
>>>>  }
>>>>  
>>>>  type_init(register_vfio_pci_dev_type)
>>>> +
>>>> +int vfio_get_container_fd(struct PCIBus *pbus)
>>>> +{
>>>> +    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
>>>> +    VFIODevice *vdev1st;
>>>> +
>>>> +    if (!kid1st) {
>>>> +        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
>>>> +               pbus->qbus.name);
>>>> +        return -1;
>>>> +    }
>>>> +    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
>>>> +
>>>> +    return vdev1st->group->container->fd;
>>>> +}
>>>> +
>>>
>>> This is not a generic implementation.  x86 won't have all devices on a
>>> bus be vfio devices and even if it did, there's no guarantee they all
>>> belong to the same container.  This should probably at least take a
>>> PCIDevice and some kind of POWER specific code will need to know that
>>> the container is the same for the whole bus.  Thanks,
>>
>>
>> This is a workaround, true. x86 does not need this call at all. And on powerpc VFIO devices won't
>> share PCI bus with emulated devices. I just need some API to get this fd.
>>
>> Well I probably can add MemoryListener for the DMA window and move all power-specific map/unmap code
>> to VFIO but it does not look much better. I would rather prefer separating IOMMU code from vfio_pci
>> somehow (more or less as it is now for powerpc). While doing it, we could think of the API to get
>> this fd which we need anyway in order to setup the DMA window which is per group (which QEMU does
>> not understand) but not per device.
> 
> The MemoryListener probably doesn't make sense with a guest driven iova
> window.  It would be an abuse of the interface I think.  At some level
> in the power code you, or at least the user, needs to know about groups
> though.  That's how you end up with an emulated bridge in front of each
> group, right?


No, this is just for interrupts swizzling. We put one group to a separate PCI bus and we do not care
about bridges on this matter.


> So with that same knowledge, shouldn't the API simply be:
> 
> int vfio_get_container_fd(PCIDevice *dev)
> 
> where power code picks a device from the bus since you know they're all
> in the same group?  Thanks,

Yep but still workaround.
A, screw this API, I came up with something different, I post it a bit later :)

Patch

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index f573a95..c46a049 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -25,4 +25,7 @@  obj-$(CONFIG_FDT) += ../device_tree.o
 # Xilinx PPC peripherals
 obj-y += xilinx_ethlite.o
 
+# VFIO PCI device assignment
+obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
+
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/spapr.h b/hw/spapr.h
index b37f337..9dca704 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -340,4 +340,8 @@  int spapr_dma_dt(void *fdt, int node_off, const char *propname,
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                       DMAContext *dma);
 
+void spapr_vfio_init_dma(int fd, uint32_t liobn,
+                         uint64_t *dma32_window_start,
+                         uint64_t *dma32_window_size);
+
 #endif /* !defined (__HW_SPAPR_H__) */
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 50c288d..0a194e8 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -16,6 +16,8 @@ 
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
+#include <sys/ioctl.h>
+
 #include "hw.h"
 #include "kvm.h"
 #include "qdev.h"
@@ -23,6 +25,7 @@ 
 #include "dma.h"
 
 #include "hw/spapr.h"
+#include "hw/linux-vfio.h"
 
 #include <libfdt.h>
 
@@ -183,6 +186,86 @@  static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
     return 0;
 }
 
+/* -------- API for POWERPC IOMMU -------- */
+
+#define POWERPC_IOMMU           2
+
+struct tce_iommu_info {
+    __u32 argsz;
+    __u32 dma32_window_start;
+    __u32 dma32_window_size;
+};
+
+#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct tce_iommu_dma_map {
+    __u32 argsz;
+    __u64 va;
+    __u64 dmaaddr;
+};
+
+#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+typedef struct sPAPRVFIOTable {
+    int fd;
+    uint32_t liobn;
+    QLIST_ENTRY(sPAPRVFIOTable) list;
+} sPAPRVFIOTable;
+
+QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
+
+void spapr_vfio_init_dma(int fd, uint32_t liobn,
+                         uint64_t *dma32_window_start,
+                         uint64_t *dma32_window_size)
+{
+    sPAPRVFIOTable *t;
+    struct tce_iommu_info info = { .argsz = sizeof(info) };
+
+    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
+        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
+        return;
+    }
+    *dma32_window_start = info.dma32_window_start;
+    *dma32_window_size = info.dma32_window_size;
+
+    t = g_malloc0(sizeof(*t));
+    t->fd = fd;
+    t->liobn = liobn;
+
+    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
+}
+
+static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
+{
+    sPAPRVFIOTable *t;
+    struct tce_iommu_dma_map map = {
+        .argsz = sizeof(map),
+        .va = 0,
+        .dmaaddr = ioba,
+    };
+
+    QLIST_FOREACH(t, &vfio_tce_tables, list) {
+        if (t->liobn != liobn) {
+            continue;
+        }
+        if (tce) {
+            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
+            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
+                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
+                return H_PARAMETER;
+            }
+        } else {
+            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
+                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
+                return H_PARAMETER;
+            }
+        }
+        return H_SUCCESS;
+    }
+    return H_CONTINUE; /* positive non-zero value */
+}
+
 static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
                               target_ulong opcode, target_ulong *args)
 {
@@ -203,6 +286,10 @@  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
     if (0 >= ret) {
         return ret ? H_PARAMETER : H_SUCCESS;
     }
+    ret = put_tce_vfio(liobn, ioba, tce);
+    if (0 >= ret) {
+        return ret ? H_PARAMETER : H_SUCCESS;
+    }
 #ifdef DEBUG_TCE
     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 5f89003..3375c3f 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -29,6 +29,7 @@ 
 #include "pci_host.h"
 #include "hw/spapr.h"
 #include "hw/spapr_pci.h"
+#include "hw/vfio_pci.h"
 #include "exec-memory.h"
 #include <libfdt.h>
 #include "trace.h"
@@ -440,6 +441,12 @@  static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
                  level);
 }
 
+static int pci_spapr_get_irq(void *opaque, int irq_num)
+{
+    sPAPRPHBState *phb = opaque;
+    return phb->lsi_table[irq_num].dt_irq;
+}
+
 static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
                               unsigned size)
 {
@@ -567,7 +574,8 @@  static int spapr_phb_init(SysBusDevice *s)
 
     bus = pci_register_bus(&phb->host_state.busdev.qdev,
                            phb->busname ? phb->busname : phb->dtbusname,
-                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
+                           pci_spapr_set_irq, pci_spapr_get_irq,
+                           pci_spapr_map_irq, phb,
                            &phb->memspace, &phb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS);
     phb->host_state.bus = bus;
@@ -596,6 +604,7 @@  static Property spapr_phb_properties[] = {
     DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
     DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
     DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
+    DEFINE_PROP_UINT8("vfio", sPAPRPHBState, is_vfio, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -639,6 +648,18 @@  void spapr_create_phb(sPAPREnvironment *spapr,
 /* Finalize PCI setup, called when all devices are already created */
 int spapr_finalize_pci_setup(sPAPRPHBState *phb)
 {
+    if (phb->is_vfio) {
+        int fd = vfio_get_container_fd(phb->host_state.bus);
+
+        if (fd < 0) {
+            return fd;
+        }
+        spapr_vfio_init_dma(fd, phb->dma_liobn,
+                            &phb->dma_window_start,
+                            &phb->dma_window_size);
+        return 0;
+    }
+
     phb->dma_window_start = 0;
     phb->dma_window_size = 0x40000000;
     phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
index 3aae273..a4f031b 100644
--- a/hw/spapr_pci.h
+++ b/hw/spapr_pci.h
@@ -57,6 +57,8 @@  typedef struct sPAPRPHBState {
         int nvec;
     } msi_table[SPAPR_MSIX_MAX_DEVS];
 
+    uint8_t is_vfio;
+
     QLIST_ENTRY(sPAPRPHBState) list;
 } sPAPRPHBState;
 
diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 1ac287f..cc0b974 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -21,7 +21,6 @@ 
 #include <dirent.h>
 #include <stdio.h>
 #include <unistd.h>
-#include <sys/io.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/types.h>
@@ -44,6 +43,17 @@ 
 #include "vfio_pci.h"
 #include "linux-vfio.h"
 
+#ifndef TARGET_PPC64
+#include <sys/io.h>
+#define VFIO_IOMMU_EXTENSION    VFIO_X86_IOMMU
+#else
+#include "hw/pci_internals.h"
+#include "hw/xics.h"
+#include "hw/spapr.h"
+#define POWERPC_IOMMU           2
+#define VFIO_IOMMU_EXTENSION    POWERPC_IOMMU
+#endif
+
 //#define DEBUG_VFIO
 #ifdef DEBUG_VFIO
 #define DPRINTF(fmt, ...) \
@@ -235,6 +245,7 @@  struct vfio_irq_set_fd {
 
 static void vfio_enable_intx_kvm(VFIODevice *vdev)
 {
+#ifndef TARGET_PPC64
 #ifdef CONFIG_KVM
     struct vfio_irq_set_fd irq_set_fd = {
 	.irq_set = {
@@ -298,10 +309,12 @@  fail:
     qemu_set_fd_handler(irqfd.fd, vfio_intx_interrupt, NULL, vdev);
     vfio_unmask_intx(vdev);
 #endif
+#endif
 }
 
 static void vfio_disable_intx_kvm(VFIODevice *vdev)
 {
+#ifndef TARGET_PPC64
 #ifdef CONFIG_KVM
     struct vfio_irq_set_fd irq_set_fd = {
 	.irq_set = {
@@ -350,8 +363,10 @@  static void vfio_disable_intx_kvm(VFIODevice *vdev)
     DPRINTF("%s(%04x:%02x:%02x.%x) KVM INTx accel disabled\n", __FUNCTION__,
             vdev->host.seg, vdev->host.bus, vdev->host.dev, vdev->host.func);
 #endif
+#endif
 }
 
+#ifndef TARGET_PPC64
 static void vfio_update_irq(Notifier *notify, void *data)
 {
     VFIODevice *vdev = container_of(notify, VFIODevice, intx.update_irq);
@@ -381,6 +396,7 @@  static void vfio_update_irq(Notifier *notify, void *data)
     /* Re-enable the interrupt in cased we missed an EOI */
     vfio_eoi(&vdev->intx.eoi, NULL);
 }
+#endif
 
 static int vfio_enable_intx(VFIODevice *vdev)
 {
@@ -404,10 +420,14 @@  static int vfio_enable_intx(VFIODevice *vdev)
     vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
     vdev->intx.irq = pci_get_irq(&vdev->pdev, vdev->intx.pin);
     vdev->intx.eoi.notify = vfio_eoi;
+#ifndef TARGET_PPC64
     ioapic_add_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
 
     vdev->intx.update_irq.notify = vfio_update_irq;
     pci_add_irq_update_notifier(&vdev->pdev, &vdev->intx.update_irq);
+#else
+    xics_add_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
+#endif
 
     if (event_notifier_init(&vdev->intx.interrupt, 0)) {
         error_report("vfio: Error: event_notifier_init failed\n");
@@ -440,8 +460,12 @@  static void vfio_disable_intx(VFIODevice *vdev)
     vfio_disable_intx_kvm(vdev);
     vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
 
+#ifndef TARGET_PPC64
     pci_remove_irq_update_notifier(&vdev->intx.update_irq);
     ioapic_remove_gsi_eoi_notifier(&vdev->intx.eoi, vdev->intx.irq);
+#else
+    xics_remove_eoi_notifier(&vdev->intx.eoi);
+#endif
 
     fd = event_notifier_get_fd(&vdev->intx.interrupt);
     qemu_set_fd_handler(fd, NULL, NULL, vdev);
@@ -543,7 +567,7 @@  static int vfio_msix_vector_use(PCIDevice *pdev,
     }
 
     fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
-
+#ifndef TARGET_PPC64
     vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
     if (vdev->msi_vectors[vector].virq < 0 || 
         kvm_irqchip_add_irqfd(kvm_state, fd,
@@ -551,7 +575,11 @@  static int vfio_msix_vector_use(PCIDevice *pdev,
         qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
                             &vdev->msi_vectors[vector]);
     }
-
+#else
+    vdev->msi_vectors[vector].virq = -1;
+    qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
+                        &vdev->msi_vectors[vector]);
+#endif
     if (vdev->nr_vectors < vector + 1) {
         int i;
 
@@ -692,6 +720,7 @@  retry:
         fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
 
         msg = msi_get_msg(&vdev->pdev, i);
+#ifndef TARGET_PPC64
         vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
         if (vdev->msi_vectors[i].virq < 0 || 
             kvm_irqchip_add_irqfd(kvm_state, fd,
@@ -699,6 +728,12 @@  retry:
             qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
                                 &vdev->msi_vectors[i]);
         }
+#else
+        vdev->msi_vectors[i].virq = -1;
+        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
+                            &vdev->msi_vectors[i]);
+        msg = msg;
+#endif
     }
     
     ret = vfio_enable_vectors(vdev, false);
@@ -1581,6 +1616,25 @@  static int vfio_connect_container(VFIOGroup *group)
 
         memory_listener_register(&container->listener, get_system_memory());
 
+#define POWERPC_IOMMU           2
+    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %s\n",
+                         strerror(errno));
+            g_free(container);
+            close(fd);
+            return -1;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %s\n",
+                         strerror(errno));
+            g_free(container);
+            close(fd);
+            return -1;
+        }
     } else {
         error_report("vfio: No available IOMMU models\n");
         g_free(container);
@@ -2005,3 +2059,19 @@  static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
+
+int vfio_get_container_fd(struct PCIBus *pbus)
+{
+    BusChild *kid1st = QTAILQ_FIRST(&pbus->qbus.children);
+    VFIODevice *vdev1st;
+
+    if (!kid1st) {
+        printf("No device registered on PCI bus \"%s\", no DMA enabled\n",
+               pbus->qbus.name);
+        return -1;
+    }
+    vdev1st = container_of(kid1st->child, VFIODevice, pdev.qdev);
+
+    return vdev1st->group->container->fd;
+}
+
diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
index 226607c..0d13341 100644
--- a/hw/vfio_pci.h
+++ b/hw/vfio_pci.h
@@ -105,4 +105,6 @@  typedef struct VFIOGroup {
 #define VFIO_FLAG_IOMMU_SHARED_BIT 0
 #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
 
+int vfio_get_container_fd(struct PCIBus *pbus);
+
 #endif /* __VFIO_H__ */