Patchwork RFC: vfio-powerpc: added VFIO support (v2)

login
register
mail settings
Submitter Alexey Kardashevskiy
Date July 12, 2012, 8:52 a.m.
Message ID <1342083134-28669-1-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/170614/
State New
Headers show

Comments

Alexey Kardashevskiy - July 12, 2012, 8:52 a.m.
It literally does the following:

1. POWERPC IOMMU support (the kernel counterpart is required)

2. The patch assumes that IOAPIC calls are going to be replaced
with something generic. I have something in my local git but it's
too early, we need to extend PCIINTxRoute first.

3. vfio_get_group() made public. I want to open IOMMU group from
the sPAPR code to have everything I need for VFIO on sPAPR and
avoid ugly workarounds with finilizing PHB setup on sPAPR.

4. Change sPAPR PHB to scan the PCI bus which is used for
the IOMMU-VFIO group. Now it is enough to add the following to
the QEMU command line to get VFIO up with all the devices from
IOMMU group with id=3:
-device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/Makefile.objs |    3 ++
 hw/spapr.h           |    4 ++
 hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
 hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
 hw/spapr_pci.h       |    5 +++
 hw/vfio_pci.c        |   28 +++++++++++-
 hw/vfio_pci.h        |    2 +
 7 files changed, 237 insertions(+), 7 deletions(-)
Blue Swirl - July 12, 2012, 8:54 p.m.
On Thu, Jul 12, 2012 at 8:52 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> It literally does the following:
>
> 1. POWERPC IOMMU support (the kernel counterpart is required)
>
> 2. The patch assumes that IOAPIC calls are going to be replaced
> with something generic. I have something in my local git but it's
> too early, we need to extend PCIINTxRoute first.
>
> 3. vfio_get_group() made public. I want to open IOMMU group from
> the sPAPR code to have everything I need for VFIO on sPAPR and
> avoid ugly workarounds with finilizing PHB setup on sPAPR.
>
> 4. Change sPAPR PHB to scan the PCI bus which is used for
> the IOMMU-VFIO group. Now it is enough to add the following to
> the QEMU command line to get VFIO up with all the devices from
> IOMMU group with id=3:
> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/Makefile.objs |    3 ++
>  hw/spapr.h           |    4 ++
>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
>  hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
>  hw/spapr_pci.h       |    5 +++
>  hw/vfio_pci.c        |   28 +++++++++++-
>  hw/vfio_pci.h        |    2 +
>  7 files changed, 237 insertions(+), 7 deletions(-)
>
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f573a95..c46a049 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>  # Xilinx PPC peripherals
>  obj-y += xilinx_ethlite.o
>
> +# VFIO PCI device assignment
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> +
>  obj-y := $(addprefix ../,$(obj-y))
> diff --git a/hw/spapr.h b/hw/spapr.h
> index b37f337..9dca704 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        DMAContext *dma);
>
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size);
> +
>  #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 50c288d..0a194e8 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -16,6 +16,8 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>   */
> +#include <sys/ioctl.h>
> +
>  #include "hw.h"
>  #include "kvm.h"
>  #include "qdev.h"
> @@ -23,6 +25,7 @@
>  #include "dma.h"
>
>  #include "hw/spapr.h"
> +#include "hw/linux-vfio.h"
>
>  #include <libfdt.h>
>
> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>      return 0;
>  }
>
> +/* -------- API for POWERPC IOMMU -------- */
> +
> +#define POWERPC_IOMMU           2
> +
> +struct tce_iommu_info {

CamelCase.

> +    __u32 argsz;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;

Please use uint32_t.

> +};
> +
> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;

The structure may or may not be padded here since there's no
QEMU_PACKED attribute. If possible, just rearrange the fields.

> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> +
> +typedef struct sPAPRVFIOTable {
> +    int fd;
> +    uint32_t liobn;
> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> +} sPAPRVFIOTable;
> +
> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> +
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> +
> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
> +        return;
> +    }
> +    *dma32_window_start = info.dma32_window_start;
> +    *dma32_window_size = info.dma32_window_size;
> +
> +    t = g_malloc0(sizeof(*t));
> +    t->fd = fd;
> +    t->liobn = liobn;
> +
> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> +}
> +
> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_dma_map map = {
> +        .argsz = sizeof(map),
> +        .va = 0,
> +        .dmaaddr = ioba,
> +    };
> +
> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> +        if (t->liobn != liobn) {
> +            continue;
> +        }
> +        if (tce) {
> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);

perror()?

> +                return H_PARAMETER;
> +            }
> +        } else {
> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        }
> +        return H_SUCCESS;
> +    }
> +    return H_CONTINUE; /* positive non-zero value */
> +}
> +
>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>      if (0 >= ret) {
>          return ret ? H_PARAMETER : H_SUCCESS;
>      }
> +    ret = put_tce_vfio(liobn, ioba, tce);
> +    if (0 >= ret) {

This order in expressions is not common, please reverse.

> +        return ret ? H_PARAMETER : H_SUCCESS;
> +    }
>  #ifdef DEBUG_TCE
>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 014297b..92c48b6 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -22,6 +22,9 @@
>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>   * THE SOFTWARE.
>   */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
>  #include "hw.h"
>  #include "pci.h"
>  #include "msi.h"
> @@ -29,10 +32,10 @@
>  #include "pci_host.h"
>  #include "hw/spapr.h"
>  #include "hw/spapr_pci.h"
> +#include "hw/vfio_pci.h"
>  #include "exec-memory.h"
>  #include <libfdt.h>
>  #include "trace.h"
> -
>  #include "hw/pci_internals.h"
>
>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>                   level);
>  }
>
> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> +{
> +    sPAPRPHBState *phb = opaque;
> +    return phb->lsi_table[irq_num].dt_irq;
> +}
> +
>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>                                unsigned size)
>  {
> @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>      return phb->dma;
>  }
>
> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> +{
> +    char iommupath[256];
> +    DIR *dirp;
> +    struct dirent *entry;
> +
> +    phb->iommugroup = vfio_get_group(phb->iommugroupid);
> +    if (!phb->iommugroup) {
> +        fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid);
> +        return -1;
> +    }
> +
> +    if (!phb->scan) {
> +        printf("Autoscan disabled\n");
> +        return 0;
> +    }
> +
> +    sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);

Please use snprintf() or g_strdup_printf().


> +    dirp = opendir(iommupath);
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32);
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (4 != sscanf(entry->d_name, "%X:%X:%X.%x",

Please put the constant last.

> +                        &domainid, &busid, &devid, &fnid)) {
> +            continue;
> +        }
> +
> +        sprintf(tmp, "%s%s/class", iommupath, entry->d_name);

Again, snprintf() or g_strdup_printf() (which avoids the alloca() too).

> +        printf("Reading device class from %s\n", tmp);

Leftover debugging?

> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +        }
> +        if (!deviceclass) {
> +            continue;
> +        }
> +#define PCI_BASE_CLASS_BRIDGE           0x06

This belongs to pci_ids.h.

> +        if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) {
> +            continue;
> +        }
> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> +            /* Tweak USB */
> +            phb->force_addr = 1;
> +            phb->enable_multifunction = 1;
> +        }
> +
> +        printf("Creating device %X:%X:%X.%x class=0x%X\n",
> +               domainid, busid, devid, fnid, deviceclass);

Lower case hex, please.

> +
> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            fprintf(stderr, "failed to create vfio-pci\n");
> +            continue;
> +        }
> +        qdev_prop_parse(dev, "host", entry->d_name);
> +        if (phb->force_addr) {
> +            sprintf(addr, "%X.%X", devid, fnid);

snprintf, lower case hex.

> +            qdev_prop_parse(dev, "addr", addr);
> +        }
> +        if (phb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        qdev_init_nofail(dev);
> +    }
> +    closedir(dirp);
> +
> +    return 0;
> +}
> +
>  static int spapr_phb_init(SysBusDevice *s)
>  {
>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s)
>
>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                             phb->busname ? phb->busname : phb->dtbusname,
> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> +                           pci_spapr_map_irq, phb,
>                             &phb->memspace, &phb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>      phb->host_state.bus = bus;
>
>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> -    phb->dma_window_start = 0;
> -    phb->dma_window_size = 0x40000000;
> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>
>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s)
>          }
>      }
>
> +    if (phb->iommugroupid >= 0) {
> +        if (0 > spapr_pci_scan_vfio(phb)) {

Order.

> +            return -1;
> +        }
> +        if (!phb->iommugroup || !phb->iommugroup->container) {
> +            return -1;
> +        }
> +        spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn,
> +                            &phb->dma_window_start,
> +                            &phb->dma_window_size);
> +        return 0;
> +    }
> +
> +    phb->dma_window_start = 0;
> +    phb->dma_window_size = 0x40000000;
> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> +                                         phb->dma_window_size);
> +
>      return 0;
>  }
>
> @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 145071c..1953a74 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState {
>          int nvec;
>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>
> +    int32_t iommugroupid;
> +    struct VFIOGroup *iommugroup;
> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> +    uint8_t enable_multifunction, force_addr;

Use bool for both?

> +
>      QLIST_ENTRY(sPAPRPHBState) list;
>  } sPAPRPHBState;
>
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 1ac287f..73681fb 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -21,7 +21,6 @@
>  #include <dirent.h>
>  #include <stdio.h>
>  #include <unistd.h>
> -#include <sys/io.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
>  #include <sys/types.h>
> @@ -43,6 +42,12 @@
>  #include "range.h"
>  #include "vfio_pci.h"
>  #include "linux-vfio.h"
> +#ifndef TARGET_PPC64
> +#include <sys/io.h>
> +#else
> +#include "hw/pci_internals.h"
> +#include "hw/spapr.h"
> +#endif
>
>  //#define DEBUG_VFIO
>  #ifdef DEBUG_VFIO
> @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group)
>
>          memory_listener_register(&container->listener, get_system_memory());
>
> +#define POWERPC_IOMMU           2
> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
>      } else {
>          error_report("vfio: No available IOMMU models\n");
>          g_free(container);
> @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
>      }
>  }
>
> -static VFIOGroup *vfio_get_group(int groupid)
> +VFIOGroup *vfio_get_group(int groupid)
>  {
>      VFIOGroup *group;
>      char path[32];
> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> index 226607c..d63dd63 100644
> --- a/hw/vfio_pci.h
> +++ b/hw/vfio_pci.h
> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>
> +VFIOGroup *vfio_get_group(int groupid);
> +
>  #endif /* __VFIO_H__ */
> --
> 1.7.10
>
>
Alex Williamson - July 12, 2012, 9:37 p.m.
On Thu, 2012-07-12 at 20:54 +0000, Blue Swirl wrote:
> On Thu, Jul 12, 2012 at 8:52 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> > It literally does the following:
> >
> > 1. POWERPC IOMMU support (the kernel counterpart is required)
> >
> > 2. The patch assumes that IOAPIC calls are going to be replaced
> > with something generic. I have something in my local git but it's
> > too early, we need to extend PCIINTxRoute first.
> >
> > 3. vfio_get_group() made public. I want to open IOMMU group from
> > the sPAPR code to have everything I need for VFIO on sPAPR and
> > avoid ugly workarounds with finilizing PHB setup on sPAPR.
> >
> > 4. Change sPAPR PHB to scan the PCI bus which is used for
> > the IOMMU-VFIO group. Now it is enough to add the following to
> > the QEMU command line to get VFIO up with all the devices from
> > IOMMU group with id=3:
> > -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> > mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
> >
> > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > ---
> >  hw/ppc/Makefile.objs |    3 ++
> >  hw/spapr.h           |    4 ++
> >  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
> >  hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
> >  hw/spapr_pci.h       |    5 +++
> >  hw/vfio_pci.c        |   28 +++++++++++-
> >  hw/vfio_pci.h        |    2 +
> >  7 files changed, 237 insertions(+), 7 deletions(-)
> >
> > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> > index f573a95..c46a049 100644
> > --- a/hw/ppc/Makefile.objs
> > +++ b/hw/ppc/Makefile.objs
> > @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
> >  # Xilinx PPC peripherals
> >  obj-y += xilinx_ethlite.o
> >
> > +# VFIO PCI device assignment
> > +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> > +
> >  obj-y := $(addprefix ../,$(obj-y))
> > diff --git a/hw/spapr.h b/hw/spapr.h
> > index b37f337..9dca704 100644
> > --- a/hw/spapr.h
> > +++ b/hw/spapr.h
> > @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> >  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
> >                        DMAContext *dma);
> >
> > +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> > +                         uint64_t *dma32_window_start,
> > +                         uint64_t *dma32_window_size);
> > +
> >  #endif /* !defined (__HW_SPAPR_H__) */
> > diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> > index 50c288d..0a194e8 100644
> > --- a/hw/spapr_iommu.c
> > +++ b/hw/spapr_iommu.c
> > @@ -16,6 +16,8 @@
> >   * You should have received a copy of the GNU Lesser General Public
> >   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> >   */
> > +#include <sys/ioctl.h>
> > +
> >  #include "hw.h"
> >  #include "kvm.h"
> >  #include "qdev.h"
> > @@ -23,6 +25,7 @@
> >  #include "dma.h"
> >
> >  #include "hw/spapr.h"
> > +#include "hw/linux-vfio.h"
> >
> >  #include <libfdt.h>
> >
> > @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
> >      return 0;
> >  }
> >
> > +/* -------- API for POWERPC IOMMU -------- */
> > +
> > +#define POWERPC_IOMMU           2
> > +
> > +struct tce_iommu_info {
> 
> CamelCase.
> 
> > +    __u32 argsz;
> > +    __u32 dma32_window_start;
> > +    __u32 dma32_window_size;
> 
> Please use uint32_t.

These should eventually be included from a kernel header file.  I assume
that's the reason for the non-qemu-isms.

> > +};
> > +
> > +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> > +
> > +struct tce_iommu_dma_map {
> > +    __u32 argsz;
> 
> The structure may or may not be padded here since there's no
> QEMU_PACKED attribute. If possible, just rearrange the fields.

I'm hoping Alexey adds a __u32 flags here, which solves that problem as
well.  Thanks,

Alex
Scott Wood - July 12, 2012, 10:35 p.m.
On 07/12/2012 03:52 AM, Alexey Kardashevskiy wrote:
> +/* -------- API for POWERPC IOMMU -------- */
> +
> +#define POWERPC_IOMMU           2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +};
> +
> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;
> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)

I thought you were going to change the name to be less generic...

-Scott
Alex Williamson - July 13, 2012, 3:47 a.m.
On Thu, 2012-07-12 at 18:52 +1000, Alexey Kardashevskiy wrote:
> It literally does the following:
> 
> 1. POWERPC IOMMU support (the kernel counterpart is required)
> 
> 2. The patch assumes that IOAPIC calls are going to be replaced
> with something generic. I have something in my local git but it's
> too early, we need to extend PCIINTxRoute first.
> 
> 3. vfio_get_group() made public. I want to open IOMMU group from
> the sPAPR code to have everything I need for VFIO on sPAPR and
> avoid ugly workarounds with finilizing PHB setup on sPAPR.
> 
> 4. Change sPAPR PHB to scan the PCI bus which is used for
> the IOMMU-VFIO group. Now it is enough to add the following to
> the QEMU command line to get VFIO up with all the devices from
> IOMMU group with id=3:
> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/Makefile.objs |    3 ++
>  hw/spapr.h           |    4 ++
>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
>  hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
>  hw/spapr_pci.h       |    5 +++
>  hw/vfio_pci.c        |   28 +++++++++++-
>  hw/vfio_pci.h        |    2 +
>  7 files changed, 237 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f573a95..c46a049 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>  # Xilinx PPC peripherals
>  obj-y += xilinx_ethlite.o
>  
> +# VFIO PCI device assignment
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> +
>  obj-y := $(addprefix ../,$(obj-y))
> diff --git a/hw/spapr.h b/hw/spapr.h
> index b37f337..9dca704 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        DMAContext *dma);
>  
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size);
> +
>  #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 50c288d..0a194e8 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -16,6 +16,8 @@
>   * You should have received a copy of the GNU Lesser General Public
>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>   */
> +#include <sys/ioctl.h>
> +
>  #include "hw.h"
>  #include "kvm.h"
>  #include "qdev.h"
> @@ -23,6 +25,7 @@
>  #include "dma.h"
>  
>  #include "hw/spapr.h"
> +#include "hw/linux-vfio.h"
>  
>  #include <libfdt.h>
>  
> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>      return 0;
>  }
>  
> +/* -------- API for POWERPC IOMMU -------- */
> +
> +#define POWERPC_IOMMU           2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +};
> +
> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;
> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> +
> +typedef struct sPAPRVFIOTable {
> +    int fd;
> +    uint32_t liobn;
> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> +} sPAPRVFIOTable;
> +
> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> +
> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> +
> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
> +        return;
> +    }
> +    *dma32_window_start = info.dma32_window_start;
> +    *dma32_window_size = info.dma32_window_size;
> +
> +    t = g_malloc0(sizeof(*t));
> +    t->fd = fd;
> +    t->liobn = liobn;
> +
> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> +}
> +
> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_dma_map map = {
> +        .argsz = sizeof(map),
> +        .va = 0,
> +        .dmaaddr = ioba,
> +    };
> +
> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> +        if (t->liobn != liobn) {
> +            continue;
> +        }
> +        if (tce) {
> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        } else {
> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
> +                return H_PARAMETER;
> +            }
> +        }
> +        return H_SUCCESS;
> +    }
> +    return H_CONTINUE; /* positive non-zero value */
> +}
> +
>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>      if (0 >= ret) {
>          return ret ? H_PARAMETER : H_SUCCESS;
>      }
> +    ret = put_tce_vfio(liobn, ioba, tce);
> +    if (0 >= ret) {
> +        return ret ? H_PARAMETER : H_SUCCESS;
> +    }
>  #ifdef DEBUG_TCE
>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 014297b..92c48b6 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -22,6 +22,9 @@
>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>   * THE SOFTWARE.
>   */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
>  #include "hw.h"
>  #include "pci.h"
>  #include "msi.h"
> @@ -29,10 +32,10 @@
>  #include "pci_host.h"
>  #include "hw/spapr.h"
>  #include "hw/spapr_pci.h"
> +#include "hw/vfio_pci.h"

:-\

>  #include "exec-memory.h"
>  #include <libfdt.h>
>  #include "trace.h"
> -
>  #include "hw/pci_internals.h"
>  
>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>                   level);
>  }
>  
> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> +{
> +    sPAPRPHBState *phb = opaque;
> +    return phb->lsi_table[irq_num].dt_irq;
> +}
> +
>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>                                unsigned size)
>  {
> @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>      return phb->dma;
>  }
>  
> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> +{
> +    char iommupath[256];
> +    DIR *dirp;
> +    struct dirent *entry;
> +
> +    phb->iommugroup = vfio_get_group(phb->iommugroupid);
> +    if (!phb->iommugroup) {
> +        fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid);
> +        return -1;
> +    }
> +
> +    if (!phb->scan) {
> +        printf("Autoscan disabled\n");
> +        return 0;
> +    }
> +
> +    sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> +    dirp = opendir(iommupath);
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32);
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (4 != sscanf(entry->d_name, "%X:%X:%X.%x",
> +                        &domainid, &busid, &devid, &fnid)) {
> +            continue;
> +        }
> +
> +        sprintf(tmp, "%s%s/class", iommupath, entry->d_name);
> +        printf("Reading device class from %s\n", tmp);
> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +        }
> +        if (!deviceclass) {
> +            continue;
> +        }
> +#define PCI_BASE_CLASS_BRIDGE           0x06
> +        if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) {
> +            continue;
> +        }
> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> +            /* Tweak USB */
> +            phb->force_addr = 1;
> +            phb->enable_multifunction = 1;
> +        }
> +
> +        printf("Creating device %X:%X:%X.%x class=0x%X\n",
> +               domainid, busid, devid, fnid, deviceclass);
> +
> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            fprintf(stderr, "failed to create vfio-pci\n");
> +            continue;
> +        }
> +        qdev_prop_parse(dev, "host", entry->d_name);
> +        if (phb->force_addr) {
> +            sprintf(addr, "%X.%X", devid, fnid);
> +            qdev_prop_parse(dev, "addr", addr);
> +        }
> +        if (phb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        qdev_init_nofail(dev);
> +    }
> +    closedir(dirp);
> +
> +    return 0;
> +}
> +
>  static int spapr_phb_init(SysBusDevice *s)
>  {
>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                             phb->busname ? phb->busname : phb->dtbusname,
> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> +                           pci_spapr_map_irq, phb,
>                             &phb->memspace, &phb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>      phb->host_state.bus = bus;
>  
>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> -    phb->dma_window_start = 0;
> -    phb->dma_window_size = 0x40000000;
> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>  
>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s)
>          }
>      }
>  
> +    if (phb->iommugroupid >= 0) {
> +        if (0 > spapr_pci_scan_vfio(phb)) {
> +            return -1;
> +        }
> +        if (!phb->iommugroup || !phb->iommugroup->container) {
> +            return -1;
> +        }
> +        spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn,

I'm not really a fan of this approach, the vfio data structures are not
designed for external use.  Perhaps they should be pulled into
vfio_pci.c.

Could we instead just make a group service function, something like:

int vfio_group_iommu_ioctl(int iommu_group, int request, ...)

Then it could just be a passthrough and you don't have to keep an fd or
de-reference vfio structures.  There's a little overhead that we have to
lookup the group each time, but this is your slow method anyway and how
many groups are going to be attached to a single guest.

> +                            &phb->dma_window_start,
> +                            &phb->dma_window_size);
> +        return 0;
> +    }
> +
> +    phb->dma_window_start = 0;
> +    phb->dma_window_size = 0x40000000;
> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> +                                         phb->dma_window_size);
> +
>      return 0;
>  }
>  
> @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 145071c..1953a74 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState {
>          int nvec;
>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>  
> +    int32_t iommugroupid;
> +    struct VFIOGroup *iommugroup;
> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> +    uint8_t enable_multifunction, force_addr;
> +
>      QLIST_ENTRY(sPAPRPHBState) list;
>  } sPAPRPHBState;
>  
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 1ac287f..73681fb 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -21,7 +21,6 @@
>  #include <dirent.h>
>  #include <stdio.h>
>  #include <unistd.h>
> -#include <sys/io.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
>  #include <sys/types.h>
> @@ -43,6 +42,12 @@
>  #include "range.h"
>  #include "vfio_pci.h"
>  #include "linux-vfio.h"
> +#ifndef TARGET_PPC64
> +#include <sys/io.h>
> +#else
> +#include "hw/pci_internals.h"
> +#include "hw/spapr.h"
> +#endif
>  
>  //#define DEBUG_VFIO
>  #ifdef DEBUG_VFIO
> @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group)
>  
>          memory_listener_register(&container->listener, get_system_memory());
>  
> +#define POWERPC_IOMMU           2
> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
>      } else {
>          error_report("vfio: No available IOMMU models\n");
>          g_free(container);
> @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
>      }
>  }
>  
> -static VFIOGroup *vfio_get_group(int groupid)
> +VFIOGroup *vfio_get_group(int groupid)
>  {
>      VFIOGroup *group;
>      char path[32];
> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> index 226607c..d63dd63 100644
> --- a/hw/vfio_pci.h
> +++ b/hw/vfio_pci.h
> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>  
> +VFIOGroup *vfio_get_group(int groupid);
> +
>  #endif /* __VFIO_H__ */
Alexey Kardashevskiy - July 13, 2012, 5:03 a.m.
On 13/07/12 13:47, Alex Williamson wrote:
> On Thu, 2012-07-12 at 18:52 +1000, Alexey Kardashevskiy wrote:
>> It literally does the following:
>>
>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>
>> 2. The patch assumes that IOAPIC calls are going to be replaced
>> with something generic. I have something in my local git but it's
>> too early, we need to extend PCIINTxRoute first.
>>
>> 3. vfio_get_group() made public. I want to open IOMMU group from
>> the sPAPR code to have everything I need for VFIO on sPAPR and
>> avoid ugly workarounds with finilizing PHB setup on sPAPR.
>>
>> 4. Change sPAPR PHB to scan the PCI bus which is used for
>> the IOMMU-VFIO group. Now it is enough to add the following to
>> the QEMU command line to get VFIO up with all the devices from
>> IOMMU group with id=3:
>> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
>> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/ppc/Makefile.objs |    3 ++
>>  hw/spapr.h           |    4 ++
>>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
>>  hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
>>  hw/spapr_pci.h       |    5 +++
>>  hw/vfio_pci.c        |   28 +++++++++++-
>>  hw/vfio_pci.h        |    2 +
>>  7 files changed, 237 insertions(+), 7 deletions(-)
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index f573a95..c46a049 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>  # Xilinx PPC peripherals
>>  obj-y += xilinx_ethlite.o
>>  
>> +# VFIO PCI device assignment
>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>> +
>>  obj-y := $(addprefix ../,$(obj-y))
>> diff --git a/hw/spapr.h b/hw/spapr.h
>> index b37f337..9dca704 100644
>> --- a/hw/spapr.h
>> +++ b/hw/spapr.h
>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>                        DMAContext *dma);
>>  
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size);
>> +
>>  #endif /* !defined (__HW_SPAPR_H__) */
>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>> index 50c288d..0a194e8 100644
>> --- a/hw/spapr_iommu.c
>> +++ b/hw/spapr_iommu.c
>> @@ -16,6 +16,8 @@
>>   * You should have received a copy of the GNU Lesser General Public
>>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>>   */
>> +#include <sys/ioctl.h>
>> +
>>  #include "hw.h"
>>  #include "kvm.h"
>>  #include "qdev.h"
>> @@ -23,6 +25,7 @@
>>  #include "dma.h"
>>  
>>  #include "hw/spapr.h"
>> +#include "hw/linux-vfio.h"
>>  
>>  #include <libfdt.h>
>>  
>> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>      return 0;
>>  }
>>  
>> +/* -------- API for POWERPC IOMMU -------- */
>> +
>> +#define POWERPC_IOMMU           2
>> +
>> +struct tce_iommu_info {
>> +    __u32 argsz;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
>> +};
>> +
>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct tce_iommu_dma_map {
>> +    __u32 argsz;
>> +    __u64 va;
>> +    __u64 dmaaddr;
>> +};
>> +
>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>> +
>> +typedef struct sPAPRVFIOTable {
>> +    int fd;
>> +    uint32_t liobn;
>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>> +} sPAPRVFIOTable;
>> +
>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>> +
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>> +
>> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
>> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
>> +        return;
>> +    }
>> +    *dma32_window_start = info.dma32_window_start;
>> +    *dma32_window_size = info.dma32_window_size;
>> +
>> +    t = g_malloc0(sizeof(*t));
>> +    t->fd = fd;
>> +    t->liobn = liobn;
>> +
>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>> +}
>> +
>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_dma_map map = {
>> +        .argsz = sizeof(map),
>> +        .va = 0,
>> +        .dmaaddr = ioba,
>> +    };
>> +
>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>> +        if (t->liobn != liobn) {
>> +            continue;
>> +        }
>> +        if (tce) {
>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        } else {
>> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        }
>> +        return H_SUCCESS;
>> +    }
>> +    return H_CONTINUE; /* positive non-zero value */
>> +}
>> +
>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>                                target_ulong opcode, target_ulong *args)
>>  {
>> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>      if (0 >= ret) {
>>          return ret ? H_PARAMETER : H_SUCCESS;
>>      }
>> +    ret = put_tce_vfio(liobn, ioba, tce);
>> +    if (0 >= ret) {
>> +        return ret ? H_PARAMETER : H_SUCCESS;
>> +    }
>>  #ifdef DEBUG_TCE
>>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>> index 014297b..92c48b6 100644
>> --- a/hw/spapr_pci.c
>> +++ b/hw/spapr_pci.c
>> @@ -22,6 +22,9 @@
>>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>   * THE SOFTWARE.
>>   */
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +
>>  #include "hw.h"
>>  #include "pci.h"
>>  #include "msi.h"
>> @@ -29,10 +32,10 @@
>>  #include "pci_host.h"
>>  #include "hw/spapr.h"
>>  #include "hw/spapr_pci.h"
>> +#include "hw/vfio_pci.h"
> 
> :-\
> 
>>  #include "exec-memory.h"
>>  #include <libfdt.h>
>>  #include "trace.h"
>> -
>>  #include "hw/pci_internals.h"
>>  
>>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>> @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>                   level);
>>  }
>>  
>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>> +{
>> +    sPAPRPHBState *phb = opaque;
>> +    return phb->lsi_table[irq_num].dt_irq;
>> +}
>> +
>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>                                unsigned size)
>>  {
>> @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>>      return phb->dma;
>>  }
>>  
>> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
>> +{
>> +    char iommupath[256];
>> +    DIR *dirp;
>> +    struct dirent *entry;
>> +
>> +    phb->iommugroup = vfio_get_group(phb->iommugroupid);
>> +    if (!phb->iommugroup) {
>> +        fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid);
>> +        return -1;
>> +    }
>> +
>> +    if (!phb->scan) {
>> +        printf("Autoscan disabled\n");
>> +        return 0;
>> +    }
>> +
>> +    sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
>> +    dirp = opendir(iommupath);
>> +
>> +    while ((entry = readdir(dirp)) != NULL) {
>> +        char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32);
>> +        FILE *deviceclassfile;
>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>> +        char addr[32];
>> +        DeviceState *dev;
>> +
>> +        if (4 != sscanf(entry->d_name, "%X:%X:%X.%x",
>> +                        &domainid, &busid, &devid, &fnid)) {
>> +            continue;
>> +        }
>> +
>> +        sprintf(tmp, "%s%s/class", iommupath, entry->d_name);
>> +        printf("Reading device class from %s\n", tmp);
>> +
>> +        deviceclassfile = fopen(tmp, "r");
>> +        if (deviceclassfile) {
>> +            fscanf(deviceclassfile, "%x", &deviceclass);
>> +            fclose(deviceclassfile);
>> +        }
>> +        if (!deviceclass) {
>> +            continue;
>> +        }
>> +#define PCI_BASE_CLASS_BRIDGE           0x06
>> +        if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) {
>> +            continue;
>> +        }
>> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
>> +            /* Tweak USB */
>> +            phb->force_addr = 1;
>> +            phb->enable_multifunction = 1;
>> +        }
>> +
>> +        printf("Creating device %X:%X:%X.%x class=0x%X\n",
>> +               domainid, busid, devid, fnid, deviceclass);
>> +
>> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
>> +        if (!dev) {
>> +            fprintf(stderr, "failed to create vfio-pci\n");
>> +            continue;
>> +        }
>> +        qdev_prop_parse(dev, "host", entry->d_name);
>> +        if (phb->force_addr) {
>> +            sprintf(addr, "%X.%X", devid, fnid);
>> +            qdev_prop_parse(dev, "addr", addr);
>> +        }
>> +        if (phb->enable_multifunction) {
>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>> +        }
>> +        qdev_init_nofail(dev);
>> +    }
>> +    closedir(dirp);
>> +
>> +    return 0;
>> +}
>> +
>>  static int spapr_phb_init(SysBusDevice *s)
>>  {
>>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
>> @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s)
>>  
>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>                             phb->busname ? phb->busname : phb->dtbusname,
>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>> +                           pci_spapr_map_irq, phb,
>>                             &phb->memspace, &phb->iospace,
>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>      phb->host_state.bus = bus;
>>  
>>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
>> -    phb->dma_window_start = 0;
>> -    phb->dma_window_size = 0x40000000;
>> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>>  
>>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
>> @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s)
>>          }
>>      }
>>  
>> +    if (phb->iommugroupid >= 0) {
>> +        if (0 > spapr_pci_scan_vfio(phb)) {
>> +            return -1;
>> +        }
>> +        if (!phb->iommugroup || !phb->iommugroup->container) {
>> +            return -1;
>> +        }
>> +        spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn,
> 
> I'm not really a fan of this approach, the vfio data structures are not
> designed for external use.  Perhaps they should be pulled into
> vfio_pci.c.

Right. I do not really want to know what is inside.


> Could we instead just make a group service function, something like:
> 
> int vfio_group_iommu_ioctl(int iommu_group, int request, ...)
> 
> Then it could just be a passthrough and you don't have to keep an fd or
> de-reference vfio structures.  There's a little overhead that we have to
> lookup the group each time, but this is your slow method anyway and how
> many groups are going to be attached to a single guest.


This solves problem of vfio_pci.h inclusion but does not solve the problem what gets created first -
VFIO PCI device or IOMMU group, and I also need to know IOMMU group id to use the API you proposed.

We could implement an QEMU IOMMU device which would take an IOMMU group id and do what
vfio_get_group() does.

The main idea is that on powerpc we know in advance which device belongs which group and we are
going to put the whole group (may be except bridges) to QEMU so it is more logical to start from
creating a IOMMU group rather than VFIO PCI device.


Lookup is not the issue here as we can make a fd array where IOMMU group id is an index as they
start from 0 and go consequently, do not expect too many of them :) Or we could save this fd
somewhere in the IOMMU device and get it from there somehow (there is no ready API to read
properties though).




>> +                            &phb->dma_window_start,
>> +                            &phb->dma_window_size);
>> +        return 0;
>> +    }
>> +
>> +    phb->dma_window_start = 0;
>> +    phb->dma_window_size = 0x40000000;
>> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>> +                                         phb->dma_window_size);
>> +
>>      return 0;
>>  }
>>  
>> @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */
>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>> index 145071c..1953a74 100644
>> --- a/hw/spapr_pci.h
>> +++ b/hw/spapr_pci.h
>> @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState {
>>          int nvec;
>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>  
>> +    int32_t iommugroupid;
>> +    struct VFIOGroup *iommugroup;
>> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
>> +    uint8_t enable_multifunction, force_addr;
>> +
>>      QLIST_ENTRY(sPAPRPHBState) list;
>>  } sPAPRPHBState;
>>  
>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>> index 1ac287f..73681fb 100644
>> --- a/hw/vfio_pci.c
>> +++ b/hw/vfio_pci.c
>> @@ -21,7 +21,6 @@
>>  #include <dirent.h>
>>  #include <stdio.h>
>>  #include <unistd.h>
>> -#include <sys/io.h>
>>  #include <sys/ioctl.h>
>>  #include <sys/mman.h>
>>  #include <sys/types.h>
>> @@ -43,6 +42,12 @@
>>  #include "range.h"
>>  #include "vfio_pci.h"
>>  #include "linux-vfio.h"
>> +#ifndef TARGET_PPC64
>> +#include <sys/io.h>
>> +#else
>> +#include "hw/pci_internals.h"
>> +#include "hw/spapr.h"
>> +#endif
>>  
>>  //#define DEBUG_VFIO
>>  #ifdef DEBUG_VFIO
>> @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group)
>>  
>>          memory_listener_register(&container->listener, get_system_memory());
>>  
>> +#define POWERPC_IOMMU           2
>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> +        if (ret) {
>> +            error_report("vfio: failed to set group container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>> +
>> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
>> +        if (ret) {
>> +            error_report("vfio: failed to set iommu for container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>>      } else {
>>          error_report("vfio: No available IOMMU models\n");
>>          g_free(container);
>> @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
>>      }
>>  }
>>  
>> -static VFIOGroup *vfio_get_group(int groupid)
>> +VFIOGroup *vfio_get_group(int groupid)
>>  {
>>      VFIOGroup *group;
>>      char path[32];
>> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
>> index 226607c..d63dd63 100644
>> --- a/hw/vfio_pci.h
>> +++ b/hw/vfio_pci.h
>> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>>  
>> +VFIOGroup *vfio_get_group(int groupid);
>> +
>>  #endif /* __VFIO_H__ */
> 
> 
>
Alexey Kardashevskiy - July 13, 2012, 5:24 a.m.
Two comments below.

On 13/07/12 06:54, Blue Swirl wrote:
> On Thu, Jul 12, 2012 at 8:52 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>> It literally does the following:
>>
>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>
>> 2. The patch assumes that IOAPIC calls are going to be replaced
>> with something generic. I have something in my local git but it's
>> too early, we need to extend PCIINTxRoute first.
>>
>> 3. vfio_get_group() made public. I want to open IOMMU group from
>> the sPAPR code to have everything I need for VFIO on sPAPR and
>> avoid ugly workarounds with finilizing PHB setup on sPAPR.
>>
>> 4. Change sPAPR PHB to scan the PCI bus which is used for
>> the IOMMU-VFIO group. Now it is enough to add the following to
>> the QEMU command line to get VFIO up with all the devices from
>> IOMMU group with id=3:
>> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
>> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/ppc/Makefile.objs |    3 ++
>>  hw/spapr.h           |    4 ++
>>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
>>  hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
>>  hw/spapr_pci.h       |    5 +++
>>  hw/vfio_pci.c        |   28 +++++++++++-
>>  hw/vfio_pci.h        |    2 +
>>  7 files changed, 237 insertions(+), 7 deletions(-)
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index f573a95..c46a049 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>  # Xilinx PPC peripherals
>>  obj-y += xilinx_ethlite.o
>>
>> +# VFIO PCI device assignment
>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>> +
>>  obj-y := $(addprefix ../,$(obj-y))
>> diff --git a/hw/spapr.h b/hw/spapr.h
>> index b37f337..9dca704 100644
>> --- a/hw/spapr.h
>> +++ b/hw/spapr.h
>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>                        DMAContext *dma);
>>
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size);
>> +
>>  #endif /* !defined (__HW_SPAPR_H__) */
>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>> index 50c288d..0a194e8 100644
>> --- a/hw/spapr_iommu.c
>> +++ b/hw/spapr_iommu.c
>> @@ -16,6 +16,8 @@
>>   * You should have received a copy of the GNU Lesser General Public
>>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>>   */
>> +#include <sys/ioctl.h>
>> +
>>  #include "hw.h"
>>  #include "kvm.h"
>>  #include "qdev.h"
>> @@ -23,6 +25,7 @@
>>  #include "dma.h"
>>
>>  #include "hw/spapr.h"
>> +#include "hw/linux-vfio.h"
>>
>>  #include <libfdt.h>
>>
>> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>      return 0;
>>  }
>>
>> +/* -------- API for POWERPC IOMMU -------- */
>> +
>> +#define POWERPC_IOMMU           2
>> +
>> +struct tce_iommu_info {
> 
> CamelCase.
> 
>> +    __u32 argsz;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
> 
> Please use uint32_t.
> 
>> +};
>> +
>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct tce_iommu_dma_map {
>> +    __u32 argsz;
> 
> The structure may or may not be padded here since there's no
> QEMU_PACKED attribute. If possible, just rearrange the fields.
> 
>> +    __u64 va;
>> +    __u64 dmaaddr;
>> +};
>> +
>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>> +
>> +typedef struct sPAPRVFIOTable {
>> +    int fd;
>> +    uint32_t liobn;
>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>> +} sPAPRVFIOTable;
>> +
>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>> +
>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>> +
>> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
>> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
>> +        return;
>> +    }
>> +    *dma32_window_start = info.dma32_window_start;
>> +    *dma32_window_size = info.dma32_window_size;
>> +
>> +    t = g_malloc0(sizeof(*t));
>> +    t->fd = fd;
>> +    t->liobn = liobn;
>> +
>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>> +}
>> +
>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_dma_map map = {
>> +        .argsz = sizeof(map),
>> +        .va = 0,
>> +        .dmaaddr = ioba,
>> +    };
>> +
>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>> +        if (t->liobn != liobn) {
>> +            continue;
>> +        }
>> +        if (tce) {
>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
> 
> perror()?
> 
>> +                return H_PARAMETER;
>> +            }
>> +        } else {
>> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
>> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
>> +                return H_PARAMETER;
>> +            }
>> +        }
>> +        return H_SUCCESS;
>> +    }
>> +    return H_CONTINUE; /* positive non-zero value */
>> +}
>> +
>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>                                target_ulong opcode, target_ulong *args)
>>  {
>> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>      if (0 >= ret) {
>>          return ret ? H_PARAMETER : H_SUCCESS;
>>      }
>> +    ret = put_tce_vfio(liobn, ioba, tce);
>> +    if (0 >= ret) {
> 
> This order in expressions is not common, please reverse.
> 
>> +        return ret ? H_PARAMETER : H_SUCCESS;
>> +    }
>>  #ifdef DEBUG_TCE
>>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>> index 014297b..92c48b6 100644
>> --- a/hw/spapr_pci.c
>> +++ b/hw/spapr_pci.c
>> @@ -22,6 +22,9 @@
>>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>   * THE SOFTWARE.
>>   */
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +
>>  #include "hw.h"
>>  #include "pci.h"
>>  #include "msi.h"
>> @@ -29,10 +32,10 @@
>>  #include "pci_host.h"
>>  #include "hw/spapr.h"
>>  #include "hw/spapr_pci.h"
>> +#include "hw/vfio_pci.h"
>>  #include "exec-memory.h"
>>  #include <libfdt.h>
>>  #include "trace.h"
>> -
>>  #include "hw/pci_internals.h"
>>
>>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>> @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>                   level);
>>  }
>>
>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>> +{
>> +    sPAPRPHBState *phb = opaque;
>> +    return phb->lsi_table[irq_num].dt_irq;
>> +}
>> +
>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>                                unsigned size)
>>  {
>> @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>>      return phb->dma;
>>  }
>>
>> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
>> +{
>> +    char iommupath[256];
>> +    DIR *dirp;
>> +    struct dirent *entry;
>> +
>> +    phb->iommugroup = vfio_get_group(phb->iommugroupid);
>> +    if (!phb->iommugroup) {
>> +        fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid);
>> +        return -1;
>> +    }
>> +
>> +    if (!phb->scan) {
>> +        printf("Autoscan disabled\n");
>> +        return 0;
>> +    }
>> +
>> +    sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> 
> Please use snprintf() or g_strdup_printf().
> 
> 
>> +    dirp = opendir(iommupath);
>> +
>> +    while ((entry = readdir(dirp)) != NULL) {
>> +        char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32);
>> +        FILE *deviceclassfile;
>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>> +        char addr[32];
>> +        DeviceState *dev;
>> +
>> +        if (4 != sscanf(entry->d_name, "%X:%X:%X.%x",
> 
> Please put the constant last.
> 
>> +                        &domainid, &busid, &devid, &fnid)) {
>> +            continue;
>> +        }
>> +
>> +        sprintf(tmp, "%s%s/class", iommupath, entry->d_name);
> 
> Again, snprintf() or g_strdup_printf() (which avoids the alloca() too).
> 
>> +        printf("Reading device class from %s\n", tmp);
> 
> Leftover debugging?
> 
>> +
>> +        deviceclassfile = fopen(tmp, "r");
>> +        if (deviceclassfile) {
>> +            fscanf(deviceclassfile, "%x", &deviceclass);
>> +            fclose(deviceclassfile);
>> +        }
>> +        if (!deviceclass) {
>> +            continue;
>> +        }
>> +#define PCI_BASE_CLASS_BRIDGE           0x06
> 
> This belongs to pci_ids.h.


It should but it is not. It is way smaller than the same one in the kernel.



>> +        if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) {
>> +            continue;
>> +        }
>> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
>> +            /* Tweak USB */
>> +            phb->force_addr = 1;
>> +            phb->enable_multifunction = 1;
>> +        }
>> +
>> +        printf("Creating device %X:%X:%X.%x class=0x%X\n",
>> +               domainid, busid, devid, fnid, deviceclass);
> 
> Lower case hex, please.
> 
>> +
>> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
>> +        if (!dev) {
>> +            fprintf(stderr, "failed to create vfio-pci\n");
>> +            continue;
>> +        }
>> +        qdev_prop_parse(dev, "host", entry->d_name);
>> +        if (phb->force_addr) {
>> +            sprintf(addr, "%X.%X", devid, fnid);
> 
> snprintf, lower case hex.
> 
>> +            qdev_prop_parse(dev, "addr", addr);
>> +        }
>> +        if (phb->enable_multifunction) {
>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>> +        }
>> +        qdev_init_nofail(dev);
>> +    }
>> +    closedir(dirp);
>> +
>> +    return 0;
>> +}
>> +
>>  static int spapr_phb_init(SysBusDevice *s)
>>  {
>>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
>> @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s)
>>
>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>                             phb->busname ? phb->busname : phb->dtbusname,
>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>> +                           pci_spapr_map_irq, phb,
>>                             &phb->memspace, &phb->iospace,
>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>      phb->host_state.bus = bus;
>>
>>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
>> -    phb->dma_window_start = 0;
>> -    phb->dma_window_size = 0x40000000;
>> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>>
>>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
>> @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s)
>>          }
>>      }
>>
>> +    if (phb->iommugroupid >= 0) {
>> +        if (0 > spapr_pci_scan_vfio(phb)) {
> 
> Order.
> 
>> +            return -1;
>> +        }
>> +        if (!phb->iommugroup || !phb->iommugroup->container) {
>> +            return -1;
>> +        }
>> +        spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn,
>> +                            &phb->dma_window_start,
>> +                            &phb->dma_window_size);
>> +        return 0;
>> +    }
>> +
>> +    phb->dma_window_start = 0;
>> +    phb->dma_window_size = 0x40000000;
>> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>> +                                         phb->dma_window_size);
>> +
>>      return 0;
>>  }
>>
>> @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */
>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>
>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>> index 145071c..1953a74 100644
>> --- a/hw/spapr_pci.h
>> +++ b/hw/spapr_pci.h
>> @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState {
>>          int nvec;
>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>
>> +    int32_t iommugroupid;
>> +    struct VFIOGroup *iommugroup;
>> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
>> +    uint8_t enable_multifunction, force_addr;
> 
> Use bool for both?



There is no DEFINE_PROP_BOOL. There is DEFINE_PROP_BIT but it works with bits, not bools.
uint8_t is the closest one.



>> +
>>      QLIST_ENTRY(sPAPRPHBState) list;
>>  } sPAPRPHBState;
>>
>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>> index 1ac287f..73681fb 100644
>> --- a/hw/vfio_pci.c
>> +++ b/hw/vfio_pci.c
>> @@ -21,7 +21,6 @@
>>  #include <dirent.h>
>>  #include <stdio.h>
>>  #include <unistd.h>
>> -#include <sys/io.h>
>>  #include <sys/ioctl.h>
>>  #include <sys/mman.h>
>>  #include <sys/types.h>
>> @@ -43,6 +42,12 @@
>>  #include "range.h"
>>  #include "vfio_pci.h"
>>  #include "linux-vfio.h"
>> +#ifndef TARGET_PPC64
>> +#include <sys/io.h>
>> +#else
>> +#include "hw/pci_internals.h"
>> +#include "hw/spapr.h"
>> +#endif
>>
>>  //#define DEBUG_VFIO
>>  #ifdef DEBUG_VFIO
>> @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group)
>>
>>          memory_listener_register(&container->listener, get_system_memory());
>>
>> +#define POWERPC_IOMMU           2
>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> +        if (ret) {
>> +            error_report("vfio: failed to set group container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>> +
>> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
>> +        if (ret) {
>> +            error_report("vfio: failed to set iommu for container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>>      } else {
>>          error_report("vfio: No available IOMMU models\n");
>>          g_free(container);
>> @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
>>      }
>>  }
>>
>> -static VFIOGroup *vfio_get_group(int groupid)
>> +VFIOGroup *vfio_get_group(int groupid)
>>  {
>>      VFIOGroup *group;
>>      char path[32];
>> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
>> index 226607c..d63dd63 100644
>> --- a/hw/vfio_pci.h
>> +++ b/hw/vfio_pci.h
>> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>>
>> +VFIOGroup *vfio_get_group(int groupid);
>> +
>>  #endif /* __VFIO_H__ */
>> --
>> 1.7.10
>>
>>
Alexey Kardashevskiy - July 13, 2012, 5:31 a.m.
On 13/07/12 08:35, Scott Wood wrote:
> On 07/12/2012 03:52 AM, Alexey Kardashevskiy wrote:
>> +/* -------- API for POWERPC IOMMU -------- */
>> +
>> +#define POWERPC_IOMMU           2
>> +
>> +struct tce_iommu_info {
>> +    __u32 argsz;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
>> +};
>> +
>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct tce_iommu_dma_map {
>> +    __u32 argsz;
>> +    __u64 va;
>> +    __u64 dmaaddr;
>> +};
>> +
>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
> 
> I thought you were going to change the name to be less generic...


I will change them indeed, I am just focused now on other things such as EOI handlers and order of
devices creation. Next iteration will be fixed.
Blue Swirl - July 13, 2012, 2:33 p.m.
On Fri, Jul 13, 2012 at 5:24 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> Two comments below.
>
> On 13/07/12 06:54, Blue Swirl wrote:
>> On Thu, Jul 12, 2012 at 8:52 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>> It literally does the following:
>>>
>>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>>
>>> 2. The patch assumes that IOAPIC calls are going to be replaced
>>> with something generic. I have something in my local git but it's
>>> too early, we need to extend PCIINTxRoute first.
>>>
>>> 3. vfio_get_group() made public. I want to open IOMMU group from
>>> the sPAPR code to have everything I need for VFIO on sPAPR and
>>> avoid ugly workarounds with finilizing PHB setup on sPAPR.
>>>
>>> 4. Change sPAPR PHB to scan the PCI bus which is used for
>>> the IOMMU-VFIO group. Now it is enough to add the following to
>>> the QEMU command line to get VFIO up with all the devices from
>>> IOMMU group with id=3:
>>> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
>>> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>>>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> ---
>>>  hw/ppc/Makefile.objs |    3 ++
>>>  hw/spapr.h           |    4 ++
>>>  hw/spapr_iommu.c     |   87 ++++++++++++++++++++++++++++++++++++++
>>>  hw/spapr_pci.c       |  115 +++++++++++++++++++++++++++++++++++++++++++++++---
>>>  hw/spapr_pci.h       |    5 +++
>>>  hw/vfio_pci.c        |   28 +++++++++++-
>>>  hw/vfio_pci.h        |    2 +
>>>  7 files changed, 237 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>> index f573a95..c46a049 100644
>>> --- a/hw/ppc/Makefile.objs
>>> +++ b/hw/ppc/Makefile.objs
>>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>>  # Xilinx PPC peripherals
>>>  obj-y += xilinx_ethlite.o
>>>
>>> +# VFIO PCI device assignment
>>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>>> +
>>>  obj-y := $(addprefix ../,$(obj-y))
>>> diff --git a/hw/spapr.h b/hw/spapr.h
>>> index b37f337..9dca704 100644
>>> --- a/hw/spapr.h
>>> +++ b/hw/spapr.h
>>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>>                        DMAContext *dma);
>>>
>>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>>> +                         uint64_t *dma32_window_start,
>>> +                         uint64_t *dma32_window_size);
>>> +
>>>  #endif /* !defined (__HW_SPAPR_H__) */
>>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>>> index 50c288d..0a194e8 100644
>>> --- a/hw/spapr_iommu.c
>>> +++ b/hw/spapr_iommu.c
>>> @@ -16,6 +16,8 @@
>>>   * You should have received a copy of the GNU Lesser General Public
>>>   * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>>>   */
>>> +#include <sys/ioctl.h>
>>> +
>>>  #include "hw.h"
>>>  #include "kvm.h"
>>>  #include "qdev.h"
>>> @@ -23,6 +25,7 @@
>>>  #include "dma.h"
>>>
>>>  #include "hw/spapr.h"
>>> +#include "hw/linux-vfio.h"
>>>
>>>  #include <libfdt.h>
>>>
>>> @@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>>      return 0;
>>>  }
>>>
>>> +/* -------- API for POWERPC IOMMU -------- */
>>> +
>>> +#define POWERPC_IOMMU           2
>>> +
>>> +struct tce_iommu_info {
>>
>> CamelCase.
>>
>>> +    __u32 argsz;
>>> +    __u32 dma32_window_start;
>>> +    __u32 dma32_window_size;
>>
>> Please use uint32_t.
>>
>>> +};
>>> +
>>> +#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>>> +
>>> +struct tce_iommu_dma_map {
>>> +    __u32 argsz;
>>
>> The structure may or may not be padded here since there's no
>> QEMU_PACKED attribute. If possible, just rearrange the fields.
>>
>>> +    __u64 va;
>>> +    __u64 dmaaddr;
>>> +};
>>> +
>>> +#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
>>> +#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
>>> +
>>> +typedef struct sPAPRVFIOTable {
>>> +    int fd;
>>> +    uint32_t liobn;
>>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>>> +} sPAPRVFIOTable;
>>> +
>>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>>> +
>>> +void spapr_vfio_init_dma(int fd, uint32_t liobn,
>>> +                         uint64_t *dma32_window_start,
>>> +                         uint64_t *dma32_window_size)
>>> +{
>>> +    sPAPRVFIOTable *t;
>>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>>> +
>>> +    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
>>> +        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
>>> +        return;
>>> +    }
>>> +    *dma32_window_start = info.dma32_window_start;
>>> +    *dma32_window_size = info.dma32_window_size;
>>> +
>>> +    t = g_malloc0(sizeof(*t));
>>> +    t->fd = fd;
>>> +    t->liobn = liobn;
>>> +
>>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>>> +}
>>> +
>>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>>> +{
>>> +    sPAPRVFIOTable *t;
>>> +    struct tce_iommu_dma_map map = {
>>> +        .argsz = sizeof(map),
>>> +        .va = 0,
>>> +        .dmaaddr = ioba,
>>> +    };
>>> +
>>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>>> +        if (t->liobn != liobn) {
>>> +            continue;
>>> +        }
>>> +        if (tce) {
>>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>>> +            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
>>> +                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
>>
>> perror()?
>>
>>> +                return H_PARAMETER;
>>> +            }
>>> +        } else {
>>> +            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
>>> +                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
>>> +                return H_PARAMETER;
>>> +            }
>>> +        }
>>> +        return H_SUCCESS;
>>> +    }
>>> +    return H_CONTINUE; /* positive non-zero value */
>>> +}
>>> +
>>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>>                                target_ulong opcode, target_ulong *args)
>>>  {
>>> @@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>>      if (0 >= ret) {
>>>          return ret ? H_PARAMETER : H_SUCCESS;
>>>      }
>>> +    ret = put_tce_vfio(liobn, ioba, tce);
>>> +    if (0 >= ret) {
>>
>> This order in expressions is not common, please reverse.
>>
>>> +        return ret ? H_PARAMETER : H_SUCCESS;
>>> +    }
>>>  #ifdef DEBUG_TCE
>>>      fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
>>>              "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
>>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>>> index 014297b..92c48b6 100644
>>> --- a/hw/spapr_pci.c
>>> +++ b/hw/spapr_pci.c
>>> @@ -22,6 +22,9 @@
>>>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>>   * THE SOFTWARE.
>>>   */
>>> +#include <sys/types.h>
>>> +#include <dirent.h>
>>> +
>>>  #include "hw.h"
>>>  #include "pci.h"
>>>  #include "msi.h"
>>> @@ -29,10 +32,10 @@
>>>  #include "pci_host.h"
>>>  #include "hw/spapr.h"
>>>  #include "hw/spapr_pci.h"
>>> +#include "hw/vfio_pci.h"
>>>  #include "exec-memory.h"
>>>  #include <libfdt.h>
>>>  #include "trace.h"
>>> -
>>>  #include "hw/pci_internals.h"
>>>
>>>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>>> @@ -440,6 +443,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>>                   level);
>>>  }
>>>
>>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>>> +{
>>> +    sPAPRPHBState *phb = opaque;
>>> +    return phb->lsi_table[irq_num].dt_irq;
>>> +}
>>> +
>>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>>                                unsigned size)
>>>  {
>>> @@ -515,6 +524,82 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>>>      return phb->dma;
>>>  }
>>>
>>> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
>>> +{
>>> +    char iommupath[256];
>>> +    DIR *dirp;
>>> +    struct dirent *entry;
>>> +
>>> +    phb->iommugroup = vfio_get_group(phb->iommugroupid);
>>> +    if (!phb->iommugroup) {
>>> +        fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid);
>>> +        return -1;
>>> +    }
>>> +
>>> +    if (!phb->scan) {
>>> +        printf("Autoscan disabled\n");
>>> +        return 0;
>>> +    }
>>> +
>>> +    sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
>>
>> Please use snprintf() or g_strdup_printf().
>>
>>
>>> +    dirp = opendir(iommupath);
>>> +
>>> +    while ((entry = readdir(dirp)) != NULL) {
>>> +        char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32);
>>> +        FILE *deviceclassfile;
>>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>>> +        char addr[32];
>>> +        DeviceState *dev;
>>> +
>>> +        if (4 != sscanf(entry->d_name, "%X:%X:%X.%x",
>>
>> Please put the constant last.
>>
>>> +                        &domainid, &busid, &devid, &fnid)) {
>>> +            continue;
>>> +        }
>>> +
>>> +        sprintf(tmp, "%s%s/class", iommupath, entry->d_name);
>>
>> Again, snprintf() or g_strdup_printf() (which avoids the alloca() too).
>>
>>> +        printf("Reading device class from %s\n", tmp);
>>
>> Leftover debugging?
>>
>>> +
>>> +        deviceclassfile = fopen(tmp, "r");
>>> +        if (deviceclassfile) {
>>> +            fscanf(deviceclassfile, "%x", &deviceclass);
>>> +            fclose(deviceclassfile);
>>> +        }
>>> +        if (!deviceclass) {
>>> +            continue;
>>> +        }
>>> +#define PCI_BASE_CLASS_BRIDGE           0x06
>>
>> This belongs to pci_ids.h.
>
>
> It should but it is not. It is way smaller than the same one in the kernel.

It's because the policy is that the file should be kept in synch with
kernel, except unused entries can be left out. So please add the entry
to pci_ids.h.

>
>
>
>>> +        if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) {
>>> +            continue;
>>> +        }
>>> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
>>> +            /* Tweak USB */
>>> +            phb->force_addr = 1;
>>> +            phb->enable_multifunction = 1;
>>> +        }
>>> +
>>> +        printf("Creating device %X:%X:%X.%x class=0x%X\n",
>>> +               domainid, busid, devid, fnid, deviceclass);
>>
>> Lower case hex, please.
>>
>>> +
>>> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
>>> +        if (!dev) {
>>> +            fprintf(stderr, "failed to create vfio-pci\n");
>>> +            continue;
>>> +        }
>>> +        qdev_prop_parse(dev, "host", entry->d_name);
>>> +        if (phb->force_addr) {
>>> +            sprintf(addr, "%X.%X", devid, fnid);
>>
>> snprintf, lower case hex.
>>
>>> +            qdev_prop_parse(dev, "addr", addr);
>>> +        }
>>> +        if (phb->enable_multifunction) {
>>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>>> +        }
>>> +        qdev_init_nofail(dev);
>>> +    }
>>> +    closedir(dirp);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>>  static int spapr_phb_init(SysBusDevice *s)
>>>  {
>>>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
>>> @@ -567,15 +652,13 @@ static int spapr_phb_init(SysBusDevice *s)
>>>
>>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>>                             phb->busname ? phb->busname : phb->dtbusname,
>>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>>> +                           pci_spapr_map_irq, phb,
>>>                             &phb->memspace, &phb->iospace,
>>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>>      phb->host_state.bus = bus;
>>>
>>>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
>>> -    phb->dma_window_start = 0;
>>> -    phb->dma_window_size = 0x40000000;
>>> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>>>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>>>
>>>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
>>> @@ -588,6 +671,24 @@ static int spapr_phb_init(SysBusDevice *s)
>>>          }
>>>      }
>>>
>>> +    if (phb->iommugroupid >= 0) {
>>> +        if (0 > spapr_pci_scan_vfio(phb)) {
>>
>> Order.
>>
>>> +            return -1;
>>> +        }
>>> +        if (!phb->iommugroup || !phb->iommugroup->container) {
>>> +            return -1;
>>> +        }
>>> +        spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn,
>>> +                            &phb->dma_window_start,
>>> +                            &phb->dma_window_size);
>>> +        return 0;
>>> +    }
>>> +
>>> +    phb->dma_window_start = 0;
>>> +    phb->dma_window_size = 0x40000000;
>>> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>>> +                                         phb->dma_window_size);
>>> +
>>>      return 0;
>>>  }
>>>
>>> @@ -599,6 +700,10 @@ static Property spapr_phb_properties[] = {
>>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
>>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */
>>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
>>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>>>      DEFINE_PROP_END_OF_LIST(),
>>>  };
>>>
>>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>>> index 145071c..1953a74 100644
>>> --- a/hw/spapr_pci.h
>>> +++ b/hw/spapr_pci.h
>>> @@ -57,6 +57,11 @@ typedef struct sPAPRPHBState {
>>>          int nvec;
>>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>>
>>> +    int32_t iommugroupid;
>>> +    struct VFIOGroup *iommugroup;
>>> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
>>> +    uint8_t enable_multifunction, force_addr;
>>
>> Use bool for both?
>
>
>
> There is no DEFINE_PROP_BOOL. There is DEFINE_PROP_BIT but it works with bits, not bools.
> uint8_t is the closest one.
>
>
>
>>> +
>>>      QLIST_ENTRY(sPAPRPHBState) list;
>>>  } sPAPRPHBState;
>>>
>>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>>> index 1ac287f..73681fb 100644
>>> --- a/hw/vfio_pci.c
>>> +++ b/hw/vfio_pci.c
>>> @@ -21,7 +21,6 @@
>>>  #include <dirent.h>
>>>  #include <stdio.h>
>>>  #include <unistd.h>
>>> -#include <sys/io.h>
>>>  #include <sys/ioctl.h>
>>>  #include <sys/mman.h>
>>>  #include <sys/types.h>
>>> @@ -43,6 +42,12 @@
>>>  #include "range.h"
>>>  #include "vfio_pci.h"
>>>  #include "linux-vfio.h"
>>> +#ifndef TARGET_PPC64
>>> +#include <sys/io.h>
>>> +#else
>>> +#include "hw/pci_internals.h"
>>> +#include "hw/spapr.h"
>>> +#endif
>>>
>>>  //#define DEBUG_VFIO
>>>  #ifdef DEBUG_VFIO
>>> @@ -1581,6 +1586,25 @@ static int vfio_connect_container(VFIOGroup *group)
>>>
>>>          memory_listener_register(&container->listener, get_system_memory());
>>>
>>> +#define POWERPC_IOMMU           2
>>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
>>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>>> +        if (ret) {
>>> +            error_report("vfio: failed to set group container: %s\n",
>>> +                         strerror(errno));
>>> +            g_free(container);
>>> +            close(fd);
>>> +            return -1;
>>> +        }
>>> +
>>> +        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
>>> +        if (ret) {
>>> +            error_report("vfio: failed to set iommu for container: %s\n",
>>> +                         strerror(errno));
>>> +            g_free(container);
>>> +            close(fd);
>>> +            return -1;
>>> +        }
>>>      } else {
>>>          error_report("vfio: No available IOMMU models\n");
>>>          g_free(container);
>>> @@ -1620,7 +1644,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
>>>      }
>>>  }
>>>
>>> -static VFIOGroup *vfio_get_group(int groupid)
>>> +VFIOGroup *vfio_get_group(int groupid)
>>>  {
>>>      VFIOGroup *group;
>>>      char path[32];
>>> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
>>> index 226607c..d63dd63 100644
>>> --- a/hw/vfio_pci.h
>>> +++ b/hw/vfio_pci.h
>>> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>>>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>>>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>>>
>>> +VFIOGroup *vfio_get_group(int groupid);
>>> +
>>>  #endif /* __VFIO_H__ */
>>> --
>>> 1.7.10
>>>
>>>
>
>
> --
> Alexey
>
>

Patch

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index f573a95..c46a049 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -25,4 +25,7 @@  obj-$(CONFIG_FDT) += ../device_tree.o
 # Xilinx PPC peripherals
 obj-y += xilinx_ethlite.o
 
+# VFIO PCI device assignment
+obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
+
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/spapr.h b/hw/spapr.h
index b37f337..9dca704 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -340,4 +340,8 @@  int spapr_dma_dt(void *fdt, int node_off, const char *propname,
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                       DMAContext *dma);
 
+void spapr_vfio_init_dma(int fd, uint32_t liobn,
+                         uint64_t *dma32_window_start,
+                         uint64_t *dma32_window_size);
+
 #endif /* !defined (__HW_SPAPR_H__) */
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 50c288d..0a194e8 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -16,6 +16,8 @@ 
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
+#include <sys/ioctl.h>
+
 #include "hw.h"
 #include "kvm.h"
 #include "qdev.h"
@@ -23,6 +25,7 @@ 
 #include "dma.h"
 
 #include "hw/spapr.h"
+#include "hw/linux-vfio.h"
 
 #include <libfdt.h>
 
@@ -183,6 +186,86 @@  static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
     return 0;
 }
 
+/* -------- API for POWERPC IOMMU -------- */
+
+#define POWERPC_IOMMU           2
+
+struct tce_iommu_info {
+    __u32 argsz;
+    __u32 dma32_window_start;
+    __u32 dma32_window_size;
+};
+
+#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct tce_iommu_dma_map {
+    __u32 argsz;
+    __u64 va;
+    __u64 dmaaddr;
+};
+
+#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+typedef struct sPAPRVFIOTable {
+    int fd;
+    uint32_t liobn;
+    QLIST_ENTRY(sPAPRVFIOTable) list;
+} sPAPRVFIOTable;
+
+QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
+
+void spapr_vfio_init_dma(int fd, uint32_t liobn,
+                         uint64_t *dma32_window_start,
+                         uint64_t *dma32_window_size)
+{
+    sPAPRVFIOTable *t;
+    struct tce_iommu_info info = { .argsz = sizeof(info) };
+
+    if (ioctl(fd, POWERPC_IOMMU_GET_INFO, &info)) {
+        fprintf(stderr, "POWERPC_IOMMU_GET_INFO failed %d\n", errno);
+        return;
+    }
+    *dma32_window_start = info.dma32_window_start;
+    *dma32_window_size = info.dma32_window_size;
+
+    t = g_malloc0(sizeof(*t));
+    t->fd = fd;
+    t->liobn = liobn;
+
+    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
+}
+
+static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
+{
+    sPAPRVFIOTable *t;
+    struct tce_iommu_dma_map map = {
+        .argsz = sizeof(map),
+        .va = 0,
+        .dmaaddr = ioba,
+    };
+
+    QLIST_FOREACH(t, &vfio_tce_tables, list) {
+        if (t->liobn != liobn) {
+            continue;
+        }
+        if (tce) {
+            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
+            if (ioctl(t->fd, POWERPC_IOMMU_MAP_DMA, &map)) {
+                fprintf(stderr, "TCE_MAP_DMA: %d\n", errno);
+                return H_PARAMETER;
+            }
+        } else {
+            if (ioctl(t->fd, POWERPC_IOMMU_UNMAP_DMA, &map)) {
+                fprintf(stderr, "TCE_UNMAP_DMA: %d\n", errno);
+                return H_PARAMETER;
+            }
+        }
+        return H_SUCCESS;
+    }
+    return H_CONTINUE; /* positive non-zero value */
+}
+
 static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
                               target_ulong opcode, target_ulong *args)
 {
@@ -203,6 +286,10 @@  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
     if (0 >= ret) {
         return ret ? H_PARAMETER : H_SUCCESS;
     }
+    ret = put_tce_vfio(liobn, ioba, tce);
+    if (0 >= ret) {
+        return ret ? H_PARAMETER : H_SUCCESS;
+    }
 #ifdef DEBUG_TCE
     fprintf(stderr, "%s on liobn=" TARGET_FMT_lx
             "  ioba 0x" TARGET_FMT_lx "  TCE 0x" TARGET_FMT_lx " ret=%d\n",
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 014297b..92c48b6 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -22,6 +22,9 @@ 
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <sys/types.h>
+#include <dirent.h>
+
 #include "hw.h"
 #include "pci.h"
 #include "msi.h"
@@ -29,10 +32,10 @@ 
 #include "pci_host.h"
 #include "hw/spapr.h"
 #include "hw/spapr_pci.h"
+#include "hw/vfio_pci.h"
 #include "exec-memory.h"
 #include <libfdt.h>
 #include "trace.h"
-
 #include "hw/pci_internals.h"
 
 /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
@@ -440,6 +443,12 @@  static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
                  level);
 }
 
+static int pci_spapr_get_irq(void *opaque, int irq_num)
+{
+    sPAPRPHBState *phb = opaque;
+    return phb->lsi_table[irq_num].dt_irq;
+}
+
 static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
                               unsigned size)
 {
@@ -515,6 +524,82 @@  static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
     return phb->dma;
 }
 
+static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
+{
+    char iommupath[256];
+    DIR *dirp;
+    struct dirent *entry;
+
+    phb->iommugroup = vfio_get_group(phb->iommugroupid);
+    if (!phb->iommugroup) {
+        fprintf(stderr, "Cannot open IOMMU group %d\n", phb->iommugroupid);
+        return -1;
+    }
+
+    if (!phb->scan) {
+        printf("Autoscan disabled\n");
+        return 0;
+    }
+
+    sprintf(iommupath, "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
+    dirp = opendir(iommupath);
+
+    while ((entry = readdir(dirp)) != NULL) {
+        char *tmp = alloca(strlen(iommupath) + strlen(entry->d_name) + 32);
+        FILE *deviceclassfile;
+        unsigned deviceclass = 0, domainid, busid, devid, fnid;
+        char addr[32];
+        DeviceState *dev;
+
+        if (4 != sscanf(entry->d_name, "%X:%X:%X.%x",
+                        &domainid, &busid, &devid, &fnid)) {
+            continue;
+        }
+
+        sprintf(tmp, "%s%s/class", iommupath, entry->d_name);
+        printf("Reading device class from %s\n", tmp);
+
+        deviceclassfile = fopen(tmp, "r");
+        if (deviceclassfile) {
+            fscanf(deviceclassfile, "%x", &deviceclass);
+            fclose(deviceclassfile);
+        }
+        if (!deviceclass) {
+            continue;
+        }
+#define PCI_BASE_CLASS_BRIDGE           0x06
+        if ((phb->scan < 2) && ((deviceclass >> 16) == PCI_BASE_CLASS_BRIDGE)) {
+            continue;
+        }
+        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
+            /* Tweak USB */
+            phb->force_addr = 1;
+            phb->enable_multifunction = 1;
+        }
+
+        printf("Creating device %X:%X:%X.%x class=0x%X\n",
+               domainid, busid, devid, fnid, deviceclass);
+
+        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
+        if (!dev) {
+            fprintf(stderr, "failed to create vfio-pci\n");
+            continue;
+        }
+        qdev_prop_parse(dev, "host", entry->d_name);
+        if (phb->force_addr) {
+            sprintf(addr, "%X.%X", devid, fnid);
+            qdev_prop_parse(dev, "addr", addr);
+        }
+        if (phb->enable_multifunction) {
+            qdev_prop_set_bit(dev, "multifunction", 1);
+        }
+        qdev_init_nofail(dev);
+    }
+    closedir(dirp);
+
+    return 0;
+}
+
 static int spapr_phb_init(SysBusDevice *s)
 {
     sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
@@ -567,15 +652,13 @@  static int spapr_phb_init(SysBusDevice *s)
 
     bus = pci_register_bus(&phb->host_state.busdev.qdev,
                            phb->busname ? phb->busname : phb->dtbusname,
-                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
+                           pci_spapr_set_irq, pci_spapr_get_irq,
+                           pci_spapr_map_irq, phb,
                            &phb->memspace, &phb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS);
     phb->host_state.bus = bus;
 
     phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
-    phb->dma_window_start = 0;
-    phb->dma_window_size = 0x40000000;
-    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
     pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
 
     QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
@@ -588,6 +671,24 @@  static int spapr_phb_init(SysBusDevice *s)
         }
     }
 
+    if (phb->iommugroupid >= 0) {
+        if (0 > spapr_pci_scan_vfio(phb)) {
+            return -1;
+        }
+        if (!phb->iommugroup || !phb->iommugroup->container) {
+            return -1;
+        }
+        spapr_vfio_init_dma(phb->iommugroup->container->fd, phb->dma_liobn,
+                            &phb->dma_window_start,
+                            &phb->dma_window_size);
+        return 0;
+    }
+
+    phb->dma_window_start = 0;
+    phb->dma_window_size = 0x40000000;
+    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
+                                         phb->dma_window_size);
+
     return 0;
 }
 
@@ -599,6 +700,10 @@  static Property spapr_phb_properties[] = {
     DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
     DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
     DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
+    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
+    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1), /* 0 don't 1 +devices 2 +buses */
+    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
+    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
index 145071c..1953a74 100644
--- a/hw/spapr_pci.h
+++ b/hw/spapr_pci.h
@@ -57,6 +57,11 @@  typedef struct sPAPRPHBState {
         int nvec;
     } msi_table[SPAPR_MSIX_MAX_DEVS];
 
+    int32_t iommugroupid;
+    struct VFIOGroup *iommugroup;
+    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
+    uint8_t enable_multifunction, force_addr;
+
     QLIST_ENTRY(sPAPRPHBState) list;
 } sPAPRPHBState;
 
diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 1ac287f..73681fb 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -21,7 +21,6 @@ 
 #include <dirent.h>
 #include <stdio.h>
 #include <unistd.h>
-#include <sys/io.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/types.h>
@@ -43,6 +42,12 @@ 
 #include "range.h"
 #include "vfio_pci.h"
 #include "linux-vfio.h"
+#ifndef TARGET_PPC64
+#include <sys/io.h>
+#else
+#include "hw/pci_internals.h"
+#include "hw/spapr.h"
+#endif
 
 //#define DEBUG_VFIO
 #ifdef DEBUG_VFIO
@@ -1581,6 +1586,25 @@  static int vfio_connect_container(VFIOGroup *group)
 
         memory_listener_register(&container->listener, get_system_memory());
 
+#define POWERPC_IOMMU           2
+    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, POWERPC_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %s\n",
+                         strerror(errno));
+            g_free(container);
+            close(fd);
+            return -1;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, POWERPC_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %s\n",
+                         strerror(errno));
+            g_free(container);
+            close(fd);
+            return -1;
+        }
     } else {
         error_report("vfio: No available IOMMU models\n");
         g_free(container);
@@ -1620,7 +1644,7 @@  static void vfio_disconnect_container(VFIOGroup *group)
     }
 }
 
-static VFIOGroup *vfio_get_group(int groupid)
+VFIOGroup *vfio_get_group(int groupid)
 {
     VFIOGroup *group;
     char path[32];
diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
index 226607c..d63dd63 100644
--- a/hw/vfio_pci.h
+++ b/hw/vfio_pci.h
@@ -105,4 +105,6 @@  typedef struct VFIOGroup {
 #define VFIO_FLAG_IOMMU_SHARED_BIT 0
 #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
 
+VFIOGroup *vfio_get_group(int groupid);
+
 #endif /* __VFIO_H__ */