Patchwork RFC: vfio-powerpc: added VFIO support (v3)

login
register
mail settings
Submitter Alexey Kardashevskiy
Date July 13, 2012, 7:26 a.m.
Message ID <1342164377-17907-1-git-send-email-aik@ozlabs.ru>
Download mbox | patch
Permalink /patch/170806/
State New
Headers show

Comments

Alexey Kardashevskiy - July 13, 2012, 7:26 a.m.
It literally does the following:

1. POWERPC IOMMU support (the kernel counterpart is required)

2. The patch assumes that IOAPIC calls are going to be replaced
with something generic.

3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
handler to call VFIO IOMMU driver.

4. Change sPAPR PHB to scan the PCI bus which is used for
the IOMMU-VFIO group. Now it is enough to add the following to
the QEMU command line to get VFIO up with all the devices from
IOMMU group with id=3:
-device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 hw/ppc/Makefile.objs  |    3 ++
 hw/spapr.h            |    4 ++
 hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
 hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
 hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
 hw/spapr_pci.h        |    4 ++
 hw/vfio_pci.c         |   30 ++++++++++++++
 hw/vfio_pci.h         |    2 +
 trace-events          |    1 +
 9 files changed, 264 insertions(+), 6 deletions(-)
 create mode 100644 hw/spapr_iommu_vfio.h
Blue Swirl - July 13, 2012, 2:38 p.m.
On Fri, Jul 13, 2012 at 7:26 AM, Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> It literally does the following:
>
> 1. POWERPC IOMMU support (the kernel counterpart is required)
>
> 2. The patch assumes that IOAPIC calls are going to be replaced
> with something generic.
>
> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
> handler to call VFIO IOMMU driver.
>
> 4. Change sPAPR PHB to scan the PCI bus which is used for
> the IOMMU-VFIO group. Now it is enough to add the following to
> the QEMU command line to get VFIO up with all the devices from
> IOMMU group with id=3:
> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/Makefile.objs  |    3 ++
>  hw/spapr.h            |    4 ++
>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
>  hw/spapr_pci.h        |    4 ++
>  hw/vfio_pci.c         |   30 ++++++++++++++
>  hw/vfio_pci.h         |    2 +
>  trace-events          |    1 +
>  9 files changed, 264 insertions(+), 6 deletions(-)
>  create mode 100644 hw/spapr_iommu_vfio.h
>
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f573a95..c46a049 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>  # Xilinx PPC peripherals
>  obj-y += xilinx_ethlite.o
>
> +# VFIO PCI device assignment
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> +
>  obj-y := $(addprefix ../,$(obj-y))
> diff --git a/hw/spapr.h b/hw/spapr.h
> index b37f337..26e26f6 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        DMAContext *dma);
>
> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size);
> +
>  #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 50c288d..e48ced1 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -23,6 +23,8 @@
>  #include "dma.h"
>
>  #include "hw/spapr.h"
> +#include "hw/spapr_iommu_vfio.h"
> +#include "hw/vfio_pci.h"
>
>  #include <libfdt.h>
>
> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>      return 0;
>  }
>
> +typedef struct sPAPRVFIOTable {
> +    int group_id;
> +    uint32_t liobn;
> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> +} sPAPRVFIOTable;
> +
> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> +
> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> +
> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
> +        return;
> +    }
> +    *dma32_window_start = info.dma32_window_start;
> +    *dma32_window_size = info.dma32_window_size;
> +
> +    t = g_malloc0(sizeof(*t));

It looks like you initialize all fields, so plain g_malloc() can be used.

> +    t->group_id = group_id;
> +    t->liobn = liobn;
> +
> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> +}
> +
> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_dma_map map = {
> +        .argsz = sizeof(map),
> +        .va = 0,
> +        .dmaaddr = ioba,
> +    };
> +
> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> +        if (t->liobn != liobn) {
> +            continue;
> +        }
> +        if (tce) {
> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
> +                                       &map)) {
> +                perror("TCE_MAP_DMA");
> +                return H_PARAMETER;
> +            }
> +        } else {
> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
> +                                       &map)) {
> +                perror("TCE_UNMAP_DMA");
> +                return H_PARAMETER;
> +            }
> +        }
> +        return H_SUCCESS;
> +    }
> +    return H_CONTINUE; /* positive non-zero value */
> +}
> +
>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
>
>      ret = put_tce_emu(liobn, ioba, tce);
> -    if (0 >= ret) {
> +    if (ret <= 0) {
> +        return ret ? H_PARAMETER : H_SUCCESS;
> +    }
> +    ret = put_tce_vfio(liobn, ioba, tce);
> +    if (ret <= 0) {
>          return ret ? H_PARAMETER : H_SUCCESS;
>      }
>  #ifdef DEBUG_TCE
> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
> new file mode 100644
> index 0000000..711e3e4
> --- /dev/null
> +++ b/hw/spapr_iommu_vfio.h
> @@ -0,0 +1,49 @@
> +/*
> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
> + * This is the copy of the kernel header.

This file should be put somewhere into linux-headers directory to match kernel.

> + *
> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
> +#define __HW_SPAPR_IOMMU_VFIO_H__
> +
> +#include "hw/linux-vfio.h"
> +
> +#define SPAPR_TCE_IOMMU         2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 flags;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +    __u64 dma64_window_start;
> +    __u64 dma64_window_size;
> +};
> +
> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;
> +    __u32 flags;
> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
> +
> +#endif
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 014297b..836ec4f 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -22,6 +22,9 @@
>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>   * THE SOFTWARE.
>   */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
>  #include "hw.h"
>  #include "pci.h"
>  #include "msi.h"
> @@ -32,7 +35,6 @@
>  #include "exec-memory.h"
>  #include <libfdt.h>
>  #include "trace.h"
> -
>  #include "hw/pci_internals.h"
>
>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>                   level);
>  }
>
> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> +{
> +    sPAPRPHBState *phb = opaque;
> +    return phb->lsi_table[irq_num].dt_irq;
> +}
> +
>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>                                unsigned size)
>  {
> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>      return phb->dma;
>  }
>
> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> +{
> +    char iommupath[256];
> +    DIR *dirp;
> +    struct dirent *entry;
> +
> +    if (!phb->scan) {
> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
> +        return 0;
> +    }
> +
> +    snprintf(iommupath, sizeof(iommupath),
> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> +    dirp = opendir(iommupath);
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        char *tmp;
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> +                   &domainid, &busid, &devid, &fnid) != 4) {
> +            continue;
> +        }
> +
> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> +        trace_spapr_pci("Reading device class from ", tmp);
> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +        }
> +        g_free(tmp);
> +
> +        if (!deviceclass) {
> +            continue;
> +        }
> +        if ((phb->scan < 2) &&
> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
> +            /* Skip _any_ bridge */
> +            continue;
> +        }
> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> +            /* Tweak USB */
> +            phb->force_addr = 1;
> +            phb->enable_multifunction = 1;
> +        }
> +
> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
> +
> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            fprintf(stderr, "failed to create vfio-pci\n");
> +            continue;
> +        }
> +        qdev_prop_parse(dev, "host", entry->d_name);
> +        if (phb->force_addr) {
> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> +            qdev_prop_parse(dev, "addr", addr);
> +        }
> +        if (phb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        qdev_init_nofail(dev);
> +    }
> +    closedir(dirp);
> +
> +    return 0;
> +}
> +
>  static int spapr_phb_init(SysBusDevice *s)
>  {
>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
>
>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                             phb->busname ? phb->busname : phb->dtbusname,
> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> +                           pci_spapr_map_irq, phb,
>                             &phb->memspace, &phb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>      phb->host_state.bus = bus;
>
>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> -    phb->dma_window_start = 0;
> -    phb->dma_window_size = 0x40000000;
> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>
>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
>          }
>      }
>
> +    if (phb->iommugroupid >= 0) {
> +        if (spapr_pci_scan_vfio(phb) < 0) {
> +            return -1;
> +        }
> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
> +                            &phb->dma_window_start,
> +                            &phb->dma_window_size);
> +        return 0;
> +    }
> +
> +    phb->dma_window_start = 0;
> +    phb->dma_window_size = 0x40000000;
> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> +                                         phb->dma_window_size);
> +
>      return 0;
>  }
>
> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 145071c..f514823 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
>          int nvec;
>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>
> +    int32_t iommugroupid;
> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> +    uint8_t enable_multifunction, force_addr;
> +
>      QLIST_ENTRY(sPAPRPHBState) list;
>  } sPAPRPHBState;
>
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 1ac287f..fc84fb4 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
>
>          memory_listener_register(&container->listener, get_system_memory());
>
> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
>      } else {
>          error_report("vfio: No available IOMMU models\n");
>          g_free(container);
> @@ -2005,3 +2023,15 @@ static void register_vfio_pci_dev_type(void)
>  }
>
>  type_init(register_vfio_pci_dev_type)
> +
> +int vfio_group_iommu_ioctl(int iommu_group, int request, void *data)
> +{
> +    VFIOGroup *group;
> +
> +    group = vfio_get_group(iommu_group);
> +    if (!group->container) {
> +        return -EINVAL;
> +    }
> +
> +    return ioctl(group->container->fd, request, data);
> +}
> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> index 226607c..f44ff07 100644
> --- a/hw/vfio_pci.h
> +++ b/hw/vfio_pci.h
> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>
> +int vfio_group_iommu_ioctl(int iommu_group, int request, void *data);
> +
>  #endif /* __VFIO_H__ */
> diff --git a/trace-events b/trace-events
> index e548f86..9100591 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -848,6 +848,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride,
>  qxl_render_update_area_done(void *cookie) "%p"
>
>  # hw/spapr_pci.c
> +spapr_pci(const char *msg1, const char *msg2) "%s%s"
>  spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)"
>  spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64
>  spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
> --
> 1.7.10
>
Alex Williamson - July 13, 2012, 3:07 p.m.
On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote:
> It literally does the following:
> 
> 1. POWERPC IOMMU support (the kernel counterpart is required)
> 
> 2. The patch assumes that IOAPIC calls are going to be replaced
> with something generic.
> 
> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
> handler to call VFIO IOMMU driver.
> 
> 4. Change sPAPR PHB to scan the PCI bus which is used for
> the IOMMU-VFIO group. Now it is enough to add the following to
> the QEMU command line to get VFIO up with all the devices from
> IOMMU group with id=3:
> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  hw/ppc/Makefile.objs  |    3 ++
>  hw/spapr.h            |    4 ++
>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
>  hw/spapr_pci.h        |    4 ++
>  hw/vfio_pci.c         |   30 ++++++++++++++
>  hw/vfio_pci.h         |    2 +
>  trace-events          |    1 +
>  9 files changed, 264 insertions(+), 6 deletions(-)
>  create mode 100644 hw/spapr_iommu_vfio.h
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index f573a95..c46a049 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>  # Xilinx PPC peripherals
>  obj-y += xilinx_ethlite.o
>  
> +# VFIO PCI device assignment
> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> +
>  obj-y := $(addprefix ../,$(obj-y))
> diff --git a/hw/spapr.h b/hw/spapr.h
> index b37f337..26e26f6 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>                        DMAContext *dma);
>  
> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size);
> +
>  #endif /* !defined (__HW_SPAPR_H__) */
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 50c288d..e48ced1 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -23,6 +23,8 @@
>  #include "dma.h"
>  
>  #include "hw/spapr.h"
> +#include "hw/spapr_iommu_vfio.h"
> +#include "hw/vfio_pci.h"
>  
>  #include <libfdt.h>
>  
> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>      return 0;
>  }
>  
> +typedef struct sPAPRVFIOTable {
> +    int group_id;
> +    uint32_t liobn;
> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> +} sPAPRVFIOTable;
> +
> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> +
> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> +                         uint64_t *dma32_window_start,
> +                         uint64_t *dma32_window_size)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> +
> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
> +        return;
> +    }
> +    *dma32_window_start = info.dma32_window_start;
> +    *dma32_window_size = info.dma32_window_size;
> +
> +    t = g_malloc0(sizeof(*t));
> +    t->group_id = group_id;
> +    t->liobn = liobn;
> +
> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> +}
> +
> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> +{
> +    sPAPRVFIOTable *t;
> +    struct tce_iommu_dma_map map = {
> +        .argsz = sizeof(map),
> +        .va = 0,
> +        .dmaaddr = ioba,
> +    };
> +
> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> +        if (t->liobn != liobn) {
> +            continue;
> +        }
> +        if (tce) {
> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
> +                                       &map)) {
> +                perror("TCE_MAP_DMA");
> +                return H_PARAMETER;
> +            }
> +        } else {
> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
> +                                       &map)) {
> +                perror("TCE_UNMAP_DMA");
> +                return H_PARAMETER;
> +            }
> +        }
> +        return H_SUCCESS;
> +    }
> +    return H_CONTINUE; /* positive non-zero value */
> +}
> +
>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>                                target_ulong opcode, target_ulong *args)
>  {
> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
>  
>      ret = put_tce_emu(liobn, ioba, tce);
> -    if (0 >= ret) {
> +    if (ret <= 0) {
> +        return ret ? H_PARAMETER : H_SUCCESS;
> +    }
> +    ret = put_tce_vfio(liobn, ioba, tce);
> +    if (ret <= 0) {
>          return ret ? H_PARAMETER : H_SUCCESS;
>      }
>  #ifdef DEBUG_TCE
> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
> new file mode 100644
> index 0000000..711e3e4
> --- /dev/null
> +++ b/hw/spapr_iommu_vfio.h
> @@ -0,0 +1,49 @@
> +/*
> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
> + * This is the copy of the kernel header.
> + *
> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
> +#define __HW_SPAPR_IOMMU_VFIO_H__
> +
> +#include "hw/linux-vfio.h"
> +
> +#define SPAPR_TCE_IOMMU         2
> +
> +struct tce_iommu_info {
> +    __u32 argsz;
> +    __u32 flags;
> +    __u32 dma32_window_start;
> +    __u32 dma32_window_size;
> +    __u64 dma64_window_start;
> +    __u64 dma64_window_size;
> +};
> +
> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
> +
> +struct tce_iommu_dma_map {
> +    __u32 argsz;
> +    __u32 flags;
> +    __u64 va;
> +    __u64 dmaaddr;
> +};
> +
> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
> +
> +#endif
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 014297b..836ec4f 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -22,6 +22,9 @@
>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>   * THE SOFTWARE.
>   */
> +#include <sys/types.h>
> +#include <dirent.h>
> +
>  #include "hw.h"
>  #include "pci.h"
>  #include "msi.h"
> @@ -32,7 +35,6 @@
>  #include "exec-memory.h"
>  #include <libfdt.h>
>  #include "trace.h"
> -
>  #include "hw/pci_internals.h"
>  
>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>                   level);
>  }
>  
> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> +{
> +    sPAPRPHBState *phb = opaque;
> +    return phb->lsi_table[irq_num].dt_irq;
> +}
> +
>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>                                unsigned size)
>  {
> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>      return phb->dma;
>  }
>  
> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> +{
> +    char iommupath[256];
> +    DIR *dirp;
> +    struct dirent *entry;
> +
> +    if (!phb->scan) {
> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
> +        return 0;
> +    }
> +
> +    snprintf(iommupath, sizeof(iommupath),
> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> +    dirp = opendir(iommupath);
> +
> +    while ((entry = readdir(dirp)) != NULL) {
> +        char *tmp;
> +        FILE *deviceclassfile;
> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> +        char addr[32];
> +        DeviceState *dev;
> +
> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> +                   &domainid, &busid, &devid, &fnid) != 4) {
> +            continue;
> +        }
> +
> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> +        trace_spapr_pci("Reading device class from ", tmp);
> +
> +        deviceclassfile = fopen(tmp, "r");
> +        if (deviceclassfile) {
> +            fscanf(deviceclassfile, "%x", &deviceclass);
> +            fclose(deviceclassfile);
> +        }
> +        g_free(tmp);
> +
> +        if (!deviceclass) {
> +            continue;
> +        }
> +        if ((phb->scan < 2) &&
> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
> +            /* Skip _any_ bridge */
> +            continue;
> +        }
> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> +            /* Tweak USB */
> +            phb->force_addr = 1;
> +            phb->enable_multifunction = 1;
> +        }
> +
> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
> +
> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> +        if (!dev) {
> +            fprintf(stderr, "failed to create vfio-pci\n");
> +            continue;
> +        }
> +        qdev_prop_parse(dev, "host", entry->d_name);
> +        if (phb->force_addr) {
> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> +            qdev_prop_parse(dev, "addr", addr);
> +        }
> +        if (phb->enable_multifunction) {
> +            qdev_prop_set_bit(dev, "multifunction", 1);
> +        }
> +        qdev_init_nofail(dev);
> +    }
> +    closedir(dirp);
> +
> +    return 0;
> +}
> +
>  static int spapr_phb_init(SysBusDevice *s)
>  {
>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
>  
>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>                             phb->busname ? phb->busname : phb->dtbusname,
> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> +                           pci_spapr_map_irq, phb,
>                             &phb->memspace, &phb->iospace,
>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>      phb->host_state.bus = bus;
>  
>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> -    phb->dma_window_start = 0;
> -    phb->dma_window_size = 0x40000000;
> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>  
>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
>          }
>      }
>  
> +    if (phb->iommugroupid >= 0) {
> +        if (spapr_pci_scan_vfio(phb) < 0) {
> +            return -1;
> +        }
> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
> +                            &phb->dma_window_start,
> +                            &phb->dma_window_size);
> +        return 0;
> +    }
> +
> +    phb->dma_window_start = 0;
> +    phb->dma_window_size = 0x40000000;
> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> +                                         phb->dma_window_size);
> +
>      return 0;
>  }
>  
> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 145071c..f514823 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
>          int nvec;
>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>  
> +    int32_t iommugroupid;
> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> +    uint8_t enable_multifunction, force_addr;
> +
>      QLIST_ENTRY(sPAPRPHBState) list;
>  } sPAPRPHBState;
>  
> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> index 1ac287f..fc84fb4 100644
> --- a/hw/vfio_pci.c
> +++ b/hw/vfio_pci.c
> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
>  
>          memory_listener_register(&container->listener, get_system_memory());
>  
> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> +        if (ret) {
> +            error_report("vfio: failed to set group container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }
> +
> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
> +        if (ret) {
> +            error_report("vfio: failed to set iommu for container: %s\n",
> +                         strerror(errno));
> +            g_free(container);
> +            close(fd);
> +            return -1;
> +        }

I think we can still do better.  The x86 code sets up a MemoryListener
here with data for that embedded into the VFIOContainer.  You don't
have, need, or want a MemoryListener, but that doesn't mean we can't
follow the model of registering that this group exists here and setting
up map/unmap callbacks.

For instance:

in vfio_pci.h:
struct sPAPRVFIOData {
    uint64_t dma32_window_start;
    uint64_t dma64_window_size;
    ....
    int (*map)(struct tce_iommu_dma_map *);
    int (*unmap)(struct tce_iommu_dma_map *);
};

appended to the above spapr tce iommu setup above:

struct tce_iommu_info info;

/* the MemoryListener embedded in container becomes a union to hold
 * iommu specific data. */
container->u.spapr.data->map = vfio_spapr_tce_map;
container->u.spapr.data->unmap = vfio_spapr_tce_unmap;

ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))

container->u.spapr.data->dma32_window_start = info.dma32_window_start;
container->u.spapr.data->dma32_window_size = info.dma32_window_size;

spapr_register_vfio_container(&container->u.spapr.data)

Then vfio_disconnect_container() could call
spapr_unregister_vfio_container().  Maybe the container contains a
function pointer to an uninit function so we don't have to ifdef between
x86 and power.  Does that make sense?  Thanks,

Alex

>      } else {
>          error_report("vfio: No available IOMMU models\n");
>          g_free(container);
> @@ -2005,3 +2023,15 @@ static void register_vfio_pci_dev_type(void)
>  }
>  
>  type_init(register_vfio_pci_dev_type)
> +
> +int vfio_group_iommu_ioctl(int iommu_group, int request, void *data)
> +{
> +    VFIOGroup *group;
> +
> +    group = vfio_get_group(iommu_group);
> +    if (!group->container) {
> +        return -EINVAL;
> +    }
> +
> +    return ioctl(group->container->fd, request, data);
> +}
> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> index 226607c..f44ff07 100644
> --- a/hw/vfio_pci.h
> +++ b/hw/vfio_pci.h
> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>  
> +int vfio_group_iommu_ioctl(int iommu_group, int request, void *data);
> +
>  #endif /* __VFIO_H__ */
> diff --git a/trace-events b/trace-events
> index e548f86..9100591 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -848,6 +848,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride,
>  qxl_render_update_area_done(void *cookie) "%p"
>  
>  # hw/spapr_pci.c
> +spapr_pci(const char *msg1, const char *msg2) "%s%s"
>  spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)"
>  spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64
>  spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
Alexey Kardashevskiy - July 14, 2012, 2:34 a.m.
On 14/07/12 01:07, Alex Williamson wrote:
> On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote:
>> It literally does the following:
>>
>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>
>> 2. The patch assumes that IOAPIC calls are going to be replaced
>> with something generic.
>>
>> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
>> handler to call VFIO IOMMU driver.
>>
>> 4. Change sPAPR PHB to scan the PCI bus which is used for
>> the IOMMU-VFIO group. Now it is enough to add the following to
>> the QEMU command line to get VFIO up with all the devices from
>> IOMMU group with id=3:
>> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
>> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  hw/ppc/Makefile.objs  |    3 ++
>>  hw/spapr.h            |    4 ++
>>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
>>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
>>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  hw/spapr_pci.h        |    4 ++
>>  hw/vfio_pci.c         |   30 ++++++++++++++
>>  hw/vfio_pci.h         |    2 +
>>  trace-events          |    1 +
>>  9 files changed, 264 insertions(+), 6 deletions(-)
>>  create mode 100644 hw/spapr_iommu_vfio.h
>>
>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>> index f573a95..c46a049 100644
>> --- a/hw/ppc/Makefile.objs
>> +++ b/hw/ppc/Makefile.objs
>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>  # Xilinx PPC peripherals
>>  obj-y += xilinx_ethlite.o
>>  
>> +# VFIO PCI device assignment
>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>> +
>>  obj-y := $(addprefix ../,$(obj-y))
>> diff --git a/hw/spapr.h b/hw/spapr.h
>> index b37f337..26e26f6 100644
>> --- a/hw/spapr.h
>> +++ b/hw/spapr.h
>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>                        DMAContext *dma);
>>  
>> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size);
>> +
>>  #endif /* !defined (__HW_SPAPR_H__) */
>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>> index 50c288d..e48ced1 100644
>> --- a/hw/spapr_iommu.c
>> +++ b/hw/spapr_iommu.c
>> @@ -23,6 +23,8 @@
>>  #include "dma.h"
>>  
>>  #include "hw/spapr.h"
>> +#include "hw/spapr_iommu_vfio.h"
>> +#include "hw/vfio_pci.h"
>>  
>>  #include <libfdt.h>
>>  
>> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>      return 0;
>>  }
>>  
>> +typedef struct sPAPRVFIOTable {
>> +    int group_id;
>> +    uint32_t liobn;
>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>> +} sPAPRVFIOTable;
>> +
>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>> +
>> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
>> +                         uint64_t *dma32_window_start,
>> +                         uint64_t *dma32_window_size)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>> +
>> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
>> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
>> +        return;
>> +    }
>> +    *dma32_window_start = info.dma32_window_start;
>> +    *dma32_window_size = info.dma32_window_size;
>> +
>> +    t = g_malloc0(sizeof(*t));
>> +    t->group_id = group_id;
>> +    t->liobn = liobn;
>> +
>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>> +}
>> +
>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>> +{
>> +    sPAPRVFIOTable *t;
>> +    struct tce_iommu_dma_map map = {
>> +        .argsz = sizeof(map),
>> +        .va = 0,
>> +        .dmaaddr = ioba,
>> +    };
>> +
>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>> +        if (t->liobn != liobn) {
>> +            continue;
>> +        }
>> +        if (tce) {
>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
>> +                                       &map)) {
>> +                perror("TCE_MAP_DMA");
>> +                return H_PARAMETER;
>> +            }
>> +        } else {
>> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
>> +                                       &map)) {
>> +                perror("TCE_UNMAP_DMA");
>> +                return H_PARAMETER;
>> +            }
>> +        }
>> +        return H_SUCCESS;
>> +    }
>> +    return H_CONTINUE; /* positive non-zero value */
>> +}
>> +
>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>                                target_ulong opcode, target_ulong *args)
>>  {
>> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
>>  
>>      ret = put_tce_emu(liobn, ioba, tce);
>> -    if (0 >= ret) {
>> +    if (ret <= 0) {
>> +        return ret ? H_PARAMETER : H_SUCCESS;
>> +    }
>> +    ret = put_tce_vfio(liobn, ioba, tce);
>> +    if (ret <= 0) {
>>          return ret ? H_PARAMETER : H_SUCCESS;
>>      }
>>  #ifdef DEBUG_TCE
>> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
>> new file mode 100644
>> index 0000000..711e3e4
>> --- /dev/null
>> +++ b/hw/spapr_iommu_vfio.h
>> @@ -0,0 +1,49 @@
>> +/*
>> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
>> + * This is the copy of the kernel header.
>> + *
>> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
>> + *
>> + * This library is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2 of the License, or (at your option) any later version.
>> + *
>> + * This library is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
>> +#define __HW_SPAPR_IOMMU_VFIO_H__
>> +
>> +#include "hw/linux-vfio.h"
>> +
>> +#define SPAPR_TCE_IOMMU         2
>> +
>> +struct tce_iommu_info {
>> +    __u32 argsz;
>> +    __u32 flags;
>> +    __u32 dma32_window_start;
>> +    __u32 dma32_window_size;
>> +    __u64 dma64_window_start;
>> +    __u64 dma64_window_size;
>> +};
>> +
>> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
>> +
>> +struct tce_iommu_dma_map {
>> +    __u32 argsz;
>> +    __u32 flags;
>> +    __u64 va;
>> +    __u64 dmaaddr;
>> +};
>> +
>> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
>> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
>> +
>> +#endif
>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>> index 014297b..836ec4f 100644
>> --- a/hw/spapr_pci.c
>> +++ b/hw/spapr_pci.c
>> @@ -22,6 +22,9 @@
>>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>   * THE SOFTWARE.
>>   */
>> +#include <sys/types.h>
>> +#include <dirent.h>
>> +
>>  #include "hw.h"
>>  #include "pci.h"
>>  #include "msi.h"
>> @@ -32,7 +35,6 @@
>>  #include "exec-memory.h"
>>  #include <libfdt.h>
>>  #include "trace.h"
>> -
>>  #include "hw/pci_internals.h"
>>  
>>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>                   level);
>>  }
>>  
>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>> +{
>> +    sPAPRPHBState *phb = opaque;
>> +    return phb->lsi_table[irq_num].dt_irq;
>> +}
>> +
>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>                                unsigned size)
>>  {
>> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>>      return phb->dma;
>>  }
>>  
>> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
>> +{
>> +    char iommupath[256];
>> +    DIR *dirp;
>> +    struct dirent *entry;
>> +
>> +    if (!phb->scan) {
>> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
>> +        return 0;
>> +    }
>> +
>> +    snprintf(iommupath, sizeof(iommupath),
>> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
>> +    dirp = opendir(iommupath);
>> +
>> +    while ((entry = readdir(dirp)) != NULL) {
>> +        char *tmp;
>> +        FILE *deviceclassfile;
>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>> +        char addr[32];
>> +        DeviceState *dev;
>> +
>> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
>> +                   &domainid, &busid, &devid, &fnid) != 4) {
>> +            continue;
>> +        }
>> +
>> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
>> +        trace_spapr_pci("Reading device class from ", tmp);
>> +
>> +        deviceclassfile = fopen(tmp, "r");
>> +        if (deviceclassfile) {
>> +            fscanf(deviceclassfile, "%x", &deviceclass);
>> +            fclose(deviceclassfile);
>> +        }
>> +        g_free(tmp);
>> +
>> +        if (!deviceclass) {
>> +            continue;
>> +        }
>> +        if ((phb->scan < 2) &&
>> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
>> +            /* Skip _any_ bridge */
>> +            continue;
>> +        }
>> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
>> +            /* Tweak USB */
>> +            phb->force_addr = 1;
>> +            phb->enable_multifunction = 1;
>> +        }
>> +
>> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
>> +
>> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
>> +        if (!dev) {
>> +            fprintf(stderr, "failed to create vfio-pci\n");
>> +            continue;
>> +        }
>> +        qdev_prop_parse(dev, "host", entry->d_name);
>> +        if (phb->force_addr) {
>> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
>> +            qdev_prop_parse(dev, "addr", addr);
>> +        }
>> +        if (phb->enable_multifunction) {
>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>> +        }
>> +        qdev_init_nofail(dev);
>> +    }
>> +    closedir(dirp);
>> +
>> +    return 0;
>> +}
>> +
>>  static int spapr_phb_init(SysBusDevice *s)
>>  {
>>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
>> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
>>  
>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>                             phb->busname ? phb->busname : phb->dtbusname,
>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>> +                           pci_spapr_map_irq, phb,
>>                             &phb->memspace, &phb->iospace,
>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>      phb->host_state.bus = bus;
>>  
>>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
>> -    phb->dma_window_start = 0;
>> -    phb->dma_window_size = 0x40000000;
>> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>>  
>>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
>> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
>>          }
>>      }
>>  
>> +    if (phb->iommugroupid >= 0) {
>> +        if (spapr_pci_scan_vfio(phb) < 0) {
>> +            return -1;
>> +        }
>> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
>> +                            &phb->dma_window_start,
>> +                            &phb->dma_window_size);
>> +        return 0;
>> +    }
>> +
>> +    phb->dma_window_start = 0;
>> +    phb->dma_window_size = 0x40000000;
>> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>> +                                         phb->dma_window_size);
>> +
>>      return 0;
>>  }
>>  
>> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>  
>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>> index 145071c..f514823 100644
>> --- a/hw/spapr_pci.h
>> +++ b/hw/spapr_pci.h
>> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
>>          int nvec;
>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>  
>> +    int32_t iommugroupid;
>> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
>> +    uint8_t enable_multifunction, force_addr;
>> +
>>      QLIST_ENTRY(sPAPRPHBState) list;
>>  } sPAPRPHBState;
>>  
>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>> index 1ac287f..fc84fb4 100644
>> --- a/hw/vfio_pci.c
>> +++ b/hw/vfio_pci.c
>> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
>>  
>>          memory_listener_register(&container->listener, get_system_memory());
>>  
>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>> +        if (ret) {
>> +            error_report("vfio: failed to set group container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
>> +
>> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
>> +        if (ret) {
>> +            error_report("vfio: failed to set iommu for container: %s\n",
>> +                         strerror(errno));
>> +            g_free(container);
>> +            close(fd);
>> +            return -1;
>> +        }
> 
> I think we can still do better.  The x86 code sets up a MemoryListener
> here with data for that embedded into the VFIOContainer.  You don't
> have, need, or want a MemoryListener, but that doesn't mean we can't
> follow the model of registering that this group exists here and setting
> up map/unmap callbacks.
> 
> For instance:
> 
> in vfio_pci.h:
> struct sPAPRVFIOData {
>     uint64_t dma32_window_start;
>     uint64_t dma64_window_size;
>     ....
>     int (*map)(struct tce_iommu_dma_map *);
>     int (*unmap)(struct tce_iommu_dma_map *);
> };
> 
> appended to the above spapr tce iommu setup above:
> 
> struct tce_iommu_info info;
> 
> /* the MemoryListener embedded in container becomes a union to hold
>  * iommu specific data. */
> container->u.spapr.data->map = vfio_spapr_tce_map;
> container->u.spapr.data->unmap = vfio_spapr_tce_unmap;

I could actually reuse x86 callbacks, just wanted to keep POWER ioctls together. The problem here is
getting the DMA window parameters and avoiding MemoryListener.

> ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))did 
> 
> container->u.spapr.data->dma32_window_start = info.dma32_window_start;
> container->u.spapr.data->dma32_window_size = info.dma32_window_size;
> 
> spapr_register_vfio_container(&container->u.spapr.data)

I assume it is called within vfio_pci.c as we do not want to access VFIOContainer members from
anywhere but vfio_pci.c.
Or we are changing the approach? I am a bit confused.


> Then vfio_disconnect_container() could call
> spapr_unregister_vfio_container().  Maybe the container contains a
> function pointer to an uninit function so we don't have to ifdef between
> x86 and power.  Does that make sense?  Thanks,

We also need to pass the numbers from the info struct to spapr_pci.c in order to tell the guest
where the DMA window besides. Another callback? This exactly what I avoided in the kernel when we
decided not to extend IOMMU API with POWER stuff, I would like to have the same here.

In general, what is good in pulling to VFIO as much platform specific stuff as possible?

I am trying to keep sPAPR IOMMU stuff away and make it easy to add new platforms to VFIO.

For example, I would rather think of moving the piece of code which checks for SPAPR_TCE_IOMMU out
of VFIO, make it a QEMUMachine callback (together with add-eoi-notifier) as the way IOMMU works is
definitely the specific machine type feature.

For example, int QEMUMachine::init_iommu(VFIOContainter *container) which would not even try
VFIO_TYPE1_IOMMU on POWER or SPAPR_TCE_IOMMU on x86 as it knows the machine and IOMMU types already.


And do something like:
typedef struct VFIOContainer {
    int fd;
    void *platform_iommu_data;
} VFIOContainer;

Create additional file called vfio_iommu_x86.c with:
struct VFIO_Type1_IOMMU {
    MemoryListener listener;
    QLIST_HEAD(, VFIOGroup) group_list;
    QLIST_ENTRY(VFIOContainer) next;
};
and put all MemoryListener stuff there.

For POWER we already have spapr_iommu.c.

Wrong direction? :)


> Alex
> 
>>      } else {
>>          error_report("vfio: No available IOMMU models\n");
>>          g_free(container);
>> @@ -2005,3 +2023,15 @@ static void register_vfio_pci_dev_type(void)
>>  }
>>  
>>  type_init(register_vfio_pci_dev_type)
>> +
>> +int vfio_group_iommu_ioctl(int iommu_group, int request, void *data)
>> +{
>> +    VFIOGroup *group;
>> +
>> +    group = vfio_get_group(iommu_group);
>> +    if (!group->container) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return ioctl(group->container->fd, request, data);
>> +}
>> diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
>> index 226607c..f44ff07 100644
>> --- a/hw/vfio_pci.h
>> +++ b/hw/vfio_pci.h
>> @@ -105,4 +105,6 @@ typedef struct VFIOGroup {
>>  #define VFIO_FLAG_IOMMU_SHARED_BIT 0
>>  #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
>>  
>> +int vfio_group_iommu_ioctl(int iommu_group, int request, void *data);
>> +
>>  #endif /* __VFIO_H__ */
>> diff --git a/trace-events b/trace-events
>> index e548f86..9100591 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -848,6 +848,7 @@ qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride,
>>  qxl_render_update_area_done(void *cookie) "%p"
>>  
>>  # hw/spapr_pci.c
>> +spapr_pci(const char *msg1, const char *msg2) "%s%s"
>>  spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)"
>>  spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64
>>  spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"
> 
> 
>
Alex Williamson - July 16, 2012, 2:21 p.m.
On Sat, 2012-07-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
> On 14/07/12 01:07, Alex Williamson wrote:
> > On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote:
> >> It literally does the following:
> >>
> >> 1. POWERPC IOMMU support (the kernel counterpart is required)
> >>
> >> 2. The patch assumes that IOAPIC calls are going to be replaced
> >> with something generic.
> >>
> >> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
> >> handler to call VFIO IOMMU driver.
> >>
> >> 4. Change sPAPR PHB to scan the PCI bus which is used for
> >> the IOMMU-VFIO group. Now it is enough to add the following to
> >> the QEMU command line to get VFIO up with all the devices from
> >> IOMMU group with id=3:
> >> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> >> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  hw/ppc/Makefile.objs  |    3 ++
> >>  hw/spapr.h            |    4 ++
> >>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
> >>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
> >>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
> >>  hw/spapr_pci.h        |    4 ++
> >>  hw/vfio_pci.c         |   30 ++++++++++++++
> >>  hw/vfio_pci.h         |    2 +
> >>  trace-events          |    1 +
> >>  9 files changed, 264 insertions(+), 6 deletions(-)
> >>  create mode 100644 hw/spapr_iommu_vfio.h
> >>
> >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >> index f573a95..c46a049 100644
> >> --- a/hw/ppc/Makefile.objs
> >> +++ b/hw/ppc/Makefile.objs
> >> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
> >>  # Xilinx PPC peripherals
> >>  obj-y += xilinx_ethlite.o
> >>  
> >> +# VFIO PCI device assignment
> >> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> >> +
> >>  obj-y := $(addprefix ../,$(obj-y))
> >> diff --git a/hw/spapr.h b/hw/spapr.h
> >> index b37f337..26e26f6 100644
> >> --- a/hw/spapr.h
> >> +++ b/hw/spapr.h
> >> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> >>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
> >>                        DMAContext *dma);
> >>  
> >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> >> +                         uint64_t *dma32_window_start,
> >> +                         uint64_t *dma32_window_size);
> >> +
> >>  #endif /* !defined (__HW_SPAPR_H__) */
> >> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> >> index 50c288d..e48ced1 100644
> >> --- a/hw/spapr_iommu.c
> >> +++ b/hw/spapr_iommu.c
> >> @@ -23,6 +23,8 @@
> >>  #include "dma.h"
> >>  
> >>  #include "hw/spapr.h"
> >> +#include "hw/spapr_iommu_vfio.h"
> >> +#include "hw/vfio_pci.h"
> >>  
> >>  #include <libfdt.h>
> >>  
> >> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
> >>      return 0;
> >>  }
> >>  
> >> +typedef struct sPAPRVFIOTable {
> >> +    int group_id;
> >> +    uint32_t liobn;
> >> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> >> +} sPAPRVFIOTable;
> >> +
> >> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> >> +
> >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> >> +                         uint64_t *dma32_window_start,
> >> +                         uint64_t *dma32_window_size)
> >> +{
> >> +    sPAPRVFIOTable *t;
> >> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> >> +
> >> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
> >> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
> >> +        return;
> >> +    }
> >> +    *dma32_window_start = info.dma32_window_start;
> >> +    *dma32_window_size = info.dma32_window_size;
> >> +
> >> +    t = g_malloc0(sizeof(*t));
> >> +    t->group_id = group_id;
> >> +    t->liobn = liobn;
> >> +
> >> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> >> +}
> >> +
> >> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> >> +{
> >> +    sPAPRVFIOTable *t;
> >> +    struct tce_iommu_dma_map map = {
> >> +        .argsz = sizeof(map),
> >> +        .va = 0,
> >> +        .dmaaddr = ioba,
> >> +    };
> >> +
> >> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> >> +        if (t->liobn != liobn) {
> >> +            continue;
> >> +        }
> >> +        if (tce) {
> >> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> >> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
> >> +                                       &map)) {
> >> +                perror("TCE_MAP_DMA");
> >> +                return H_PARAMETER;
> >> +            }
> >> +        } else {
> >> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
> >> +                                       &map)) {
> >> +                perror("TCE_UNMAP_DMA");
> >> +                return H_PARAMETER;
> >> +            }
> >> +        }
> >> +        return H_SUCCESS;
> >> +    }
> >> +    return H_CONTINUE; /* positive non-zero value */
> >> +}
> >> +
> >>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> >>                                target_ulong opcode, target_ulong *args)
> >>  {
> >> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> >>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> >>  
> >>      ret = put_tce_emu(liobn, ioba, tce);
> >> -    if (0 >= ret) {
> >> +    if (ret <= 0) {
> >> +        return ret ? H_PARAMETER : H_SUCCESS;
> >> +    }
> >> +    ret = put_tce_vfio(liobn, ioba, tce);
> >> +    if (ret <= 0) {
> >>          return ret ? H_PARAMETER : H_SUCCESS;
> >>      }
> >>  #ifdef DEBUG_TCE
> >> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
> >> new file mode 100644
> >> index 0000000..711e3e4
> >> --- /dev/null
> >> +++ b/hw/spapr_iommu_vfio.h
> >> @@ -0,0 +1,49 @@
> >> +/*
> >> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
> >> + * This is the copy of the kernel header.
> >> + *
> >> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
> >> + *
> >> + * This library is free software; you can redistribute it and/or
> >> + * modify it under the terms of the GNU Lesser General Public
> >> + * License as published by the Free Software Foundation; either
> >> + * version 2 of the License, or (at your option) any later version.
> >> + *
> >> + * This library is distributed in the hope that it will be useful,
> >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> + * Lesser General Public License for more details.
> >> + *
> >> + * You should have received a copy of the GNU Lesser General Public
> >> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
> >> +#define __HW_SPAPR_IOMMU_VFIO_H__
> >> +
> >> +#include "hw/linux-vfio.h"
> >> +
> >> +#define SPAPR_TCE_IOMMU         2
> >> +
> >> +struct tce_iommu_info {
> >> +    __u32 argsz;
> >> +    __u32 flags;
> >> +    __u32 dma32_window_start;
> >> +    __u32 dma32_window_size;
> >> +    __u64 dma64_window_start;
> >> +    __u64 dma64_window_size;
> >> +};
> >> +
> >> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
> >> +
> >> +struct tce_iommu_dma_map {
> >> +    __u32 argsz;
> >> +    __u32 flags;
> >> +    __u64 va;
> >> +    __u64 dmaaddr;
> >> +};
> >> +
> >> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
> >> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
> >> +
> >> +#endif
> >> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> >> index 014297b..836ec4f 100644
> >> --- a/hw/spapr_pci.c
> >> +++ b/hw/spapr_pci.c
> >> @@ -22,6 +22,9 @@
> >>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >>   * THE SOFTWARE.
> >>   */
> >> +#include <sys/types.h>
> >> +#include <dirent.h>
> >> +
> >>  #include "hw.h"
> >>  #include "pci.h"
> >>  #include "msi.h"
> >> @@ -32,7 +35,6 @@
> >>  #include "exec-memory.h"
> >>  #include <libfdt.h>
> >>  #include "trace.h"
> >> -
> >>  #include "hw/pci_internals.h"
> >>  
> >>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> >> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
> >>                   level);
> >>  }
> >>  
> >> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> >> +{
> >> +    sPAPRPHBState *phb = opaque;
> >> +    return phb->lsi_table[irq_num].dt_irq;
> >> +}
> >> +
> >>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
> >>                                unsigned size)
> >>  {
> >> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
> >>      return phb->dma;
> >>  }
> >>  
> >> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> >> +{
> >> +    char iommupath[256];
> >> +    DIR *dirp;
> >> +    struct dirent *entry;
> >> +
> >> +    if (!phb->scan) {
> >> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
> >> +        return 0;
> >> +    }
> >> +
> >> +    snprintf(iommupath, sizeof(iommupath),
> >> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> >> +    dirp = opendir(iommupath);
> >> +
> >> +    while ((entry = readdir(dirp)) != NULL) {
> >> +        char *tmp;
> >> +        FILE *deviceclassfile;
> >> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> >> +        char addr[32];
> >> +        DeviceState *dev;
> >> +
> >> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> >> +                   &domainid, &busid, &devid, &fnid) != 4) {
> >> +            continue;
> >> +        }
> >> +
> >> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> >> +        trace_spapr_pci("Reading device class from ", tmp);
> >> +
> >> +        deviceclassfile = fopen(tmp, "r");
> >> +        if (deviceclassfile) {
> >> +            fscanf(deviceclassfile, "%x", &deviceclass);
> >> +            fclose(deviceclassfile);
> >> +        }
> >> +        g_free(tmp);
> >> +
> >> +        if (!deviceclass) {
> >> +            continue;
> >> +        }
> >> +        if ((phb->scan < 2) &&
> >> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
> >> +            /* Skip _any_ bridge */
> >> +            continue;
> >> +        }
> >> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> >> +            /* Tweak USB */
> >> +            phb->force_addr = 1;
> >> +            phb->enable_multifunction = 1;
> >> +        }
> >> +
> >> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
> >> +
> >> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> >> +        if (!dev) {
> >> +            fprintf(stderr, "failed to create vfio-pci\n");
> >> +            continue;
> >> +        }
> >> +        qdev_prop_parse(dev, "host", entry->d_name);
> >> +        if (phb->force_addr) {
> >> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> >> +            qdev_prop_parse(dev, "addr", addr);
> >> +        }
> >> +        if (phb->enable_multifunction) {
> >> +            qdev_prop_set_bit(dev, "multifunction", 1);
> >> +        }
> >> +        qdev_init_nofail(dev);
> >> +    }
> >> +    closedir(dirp);
> >> +
> >> +    return 0;
> >> +}
> >> +
> >>  static int spapr_phb_init(SysBusDevice *s)
> >>  {
> >>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> >> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
> >>  
> >>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
> >>                             phb->busname ? phb->busname : phb->dtbusname,
> >> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> >> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> >> +                           pci_spapr_map_irq, phb,
> >>                             &phb->memspace, &phb->iospace,
> >>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
> >>      phb->host_state.bus = bus;
> >>  
> >>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> >> -    phb->dma_window_start = 0;
> >> -    phb->dma_window_size = 0x40000000;
> >> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
> >>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
> >>  
> >>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> >> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
> >>          }
> >>      }
> >>  
> >> +    if (phb->iommugroupid >= 0) {
> >> +        if (spapr_pci_scan_vfio(phb) < 0) {
> >> +            return -1;
> >> +        }
> >> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
> >> +                            &phb->dma_window_start,
> >> +                            &phb->dma_window_size);
> >> +        return 0;
> >> +    }
> >> +
> >> +    phb->dma_window_start = 0;
> >> +    phb->dma_window_size = 0x40000000;
> >> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> >> +                                         phb->dma_window_size);
> >> +
> >>      return 0;
> >>  }
> >>  
> >> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
> >>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
> >>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
> >>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> >> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> >> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
> >> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> >> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
> >>      DEFINE_PROP_END_OF_LIST(),
> >>  };
> >>  
> >> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> >> index 145071c..f514823 100644
> >> --- a/hw/spapr_pci.h
> >> +++ b/hw/spapr_pci.h
> >> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
> >>          int nvec;
> >>      } msi_table[SPAPR_MSIX_MAX_DEVS];
> >>  
> >> +    int32_t iommugroupid;
> >> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> >> +    uint8_t enable_multifunction, force_addr;
> >> +
> >>      QLIST_ENTRY(sPAPRPHBState) list;
> >>  } sPAPRPHBState;
> >>  
> >> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> >> index 1ac287f..fc84fb4 100644
> >> --- a/hw/vfio_pci.c
> >> +++ b/hw/vfio_pci.c
> >> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
> >>  
> >>          memory_listener_register(&container->listener, get_system_memory());
> >>  
> >> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
> >> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> >> +        if (ret) {
> >> +            error_report("vfio: failed to set group container: %s\n",
> >> +                         strerror(errno));
> >> +            g_free(container);
> >> +            close(fd);
> >> +            return -1;
> >> +        }
> >> +
> >> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
> >> +        if (ret) {
> >> +            error_report("vfio: failed to set iommu for container: %s\n",
> >> +                         strerror(errno));
> >> +            g_free(container);
> >> +            close(fd);
> >> +            return -1;
> >> +        }
> > 
> > I think we can still do better.  The x86 code sets up a MemoryListener
> > here with data for that embedded into the VFIOContainer.  You don't
> > have, need, or want a MemoryListener, but that doesn't mean we can't
> > follow the model of registering that this group exists here and setting
> > up map/unmap callbacks.
> > 
> > For instance:
> > 
> > in vfio_pci.h:
> > struct sPAPRVFIOData {
> >     uint64_t dma32_window_start;
> >     uint64_t dma64_window_size;
> >     ....
> >     int (*map)(struct tce_iommu_dma_map *);
> >     int (*unmap)(struct tce_iommu_dma_map *);
> > };
> > 
> > appended to the above spapr tce iommu setup above:
> > 
> > struct tce_iommu_info info;
> > 
> > /* the MemoryListener embedded in container becomes a union to hold
> >  * iommu specific data. */
> > container->u.spapr.data->map = vfio_spapr_tce_map;
> > container->u.spapr.data->unmap = vfio_spapr_tce_unmap;
> 
> I could actually reuse x86 callbacks, just wanted to keep POWER ioctls together. The problem here is
> getting the DMA window parameters and avoiding MemoryListener.

The callback is pretty trivial though, filling in the data structure and
calling the ioctl.  We have different data structures and different
ioctls, so probably not a lot to leverage.

> > ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))did 
> > 
> > container->u.spapr.data->dma32_window_start = info.dma32_window_start;
> > container->u.spapr.data->dma32_window_size = info.dma32_window_size;
> > 
> > spapr_register_vfio_container(&container->u.spapr.data)
> 
> I assume it is called within vfio_pci.c as we do not want to access VFIOContainer members from
> anywhere but vfio_pci.c.
> Or we are changing the approach? I am a bit confused.

Yes, the registration function would be called from vfio_pci at the
equivalent place in the spar iommu test as x86 is setting up the memory
listener.  That would register the map/unmap function pointers and dma
window information.  spapr would then make map/unmap calls using those
function pointers, those would be implemented in vfio_pci where they
could dereference the container and therefore get to the container fd.

> > Then vfio_disconnect_container() could call
> > spapr_unregister_vfio_container().  Maybe the container contains a
> > function pointer to an uninit function so we don't have to ifdef between
> > x86 and power.  Does that make sense?  Thanks,
> 
> We also need to pass the numbers from the info struct to spapr_pci.c in order to tell the guest
> where the DMA window besides. Another callback? This exactly what I avoided in the kernel when we
> decided not to extend IOMMU API with POWER stuff, I would like to have the same here.

This in an internal API, there's no penalty for fixing it later.
callbacks and window info are passed in the data structure outlined
above.

> In general, what is good in pulling to VFIO as much platform specific stuff as possible?
> 
> I am trying to keep sPAPR IOMMU stuff away and make it easy to add new platforms to VFIO.
> 
> For example, I would rather think of moving the piece of code which checks for SPAPR_TCE_IOMMU out
> of VFIO, make it a QEMUMachine callback (together with add-eoi-notifier) as the way IOMMU works is
> definitely the specific machine type feature.
> 
> For example, int QEMUMachine::init_iommu(VFIOContainter *container) which would not even try
> VFIO_TYPE1_IOMMU on POWER or SPAPR_TCE_IOMMU on x86 as it knows the machine and IOMMU types already.

I don't think tying vfio into the QEMUMachine type has a future.  If you
convince Anthony or Michael otherwise, let me know.  Your attempt to
keep spapr stuff completely out of vfio is requiring private interfaces
to be exposed.  That I think is the wrong direction.

> And do something like:
> typedef struct VFIOContainer {
>     int fd;
>     void *platform_iommu_data;
> } VFIOContainer;
> 
> Create additional file called vfio_iommu_x86.c with:
> struct VFIO_Type1_IOMMU {
>     MemoryListener listener;
>     QLIST_HEAD(, VFIOGroup) group_list;
>     QLIST_ENTRY(VFIOContainer) next;
> };
> and put all MemoryListener stuff there.
> 
> For POWER we already have spapr_iommu.c.
> 
> Wrong direction? :)

This is actually very similar to what I'm proposing above.  Rather than
a platform_iommu_data pointer, I'm suggesting that we make a union in
VFIOContainer.  That allows us to actually dereference the container and
get back to the fd in the callback functions.  Thanks,

Alex
Alex Williamson - July 16, 2012, 9:17 p.m.
On Mon, 2012-07-16 at 08:21 -0600, Alex Williamson wrote:
> On Sat, 2012-07-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
> > On 14/07/12 01:07, Alex Williamson wrote:
> > > On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote:
> > >> It literally does the following:
> > >>
> > >> 1. POWERPC IOMMU support (the kernel counterpart is required)
> > >>
> > >> 2. The patch assumes that IOAPIC calls are going to be replaced
> > >> with something generic.
> > >>
> > >> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
> > >> handler to call VFIO IOMMU driver.
> > >>
> > >> 4. Change sPAPR PHB to scan the PCI bus which is used for
> > >> the IOMMU-VFIO group. Now it is enough to add the following to
> > >> the QEMU command line to get VFIO up with all the devices from
> > >> IOMMU group with id=3:
> > >> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> > >> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
> > >>
> > >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > >> ---
> > >>  hw/ppc/Makefile.objs  |    3 ++
> > >>  hw/spapr.h            |    4 ++
> > >>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
> > >>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
> > >>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
> > >>  hw/spapr_pci.h        |    4 ++
> > >>  hw/vfio_pci.c         |   30 ++++++++++++++
> > >>  hw/vfio_pci.h         |    2 +
> > >>  trace-events          |    1 +
> > >>  9 files changed, 264 insertions(+), 6 deletions(-)
> > >>  create mode 100644 hw/spapr_iommu_vfio.h
> > >>
> > >> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> > >> index f573a95..c46a049 100644
> > >> --- a/hw/ppc/Makefile.objs
> > >> +++ b/hw/ppc/Makefile.objs
> > >> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
> > >>  # Xilinx PPC peripherals
> > >>  obj-y += xilinx_ethlite.o
> > >>  
> > >> +# VFIO PCI device assignment
> > >> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> > >> +
> > >>  obj-y := $(addprefix ../,$(obj-y))
> > >> diff --git a/hw/spapr.h b/hw/spapr.h
> > >> index b37f337..26e26f6 100644
> > >> --- a/hw/spapr.h
> > >> +++ b/hw/spapr.h
> > >> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> > >>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
> > >>                        DMAContext *dma);
> > >>  
> > >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> > >> +                         uint64_t *dma32_window_start,
> > >> +                         uint64_t *dma32_window_size);
> > >> +
> > >>  #endif /* !defined (__HW_SPAPR_H__) */
> > >> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> > >> index 50c288d..e48ced1 100644
> > >> --- a/hw/spapr_iommu.c
> > >> +++ b/hw/spapr_iommu.c
> > >> @@ -23,6 +23,8 @@
> > >>  #include "dma.h"
> > >>  
> > >>  #include "hw/spapr.h"
> > >> +#include "hw/spapr_iommu_vfio.h"
> > >> +#include "hw/vfio_pci.h"
> > >>  
> > >>  #include <libfdt.h>
> > >>  
> > >> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
> > >>      return 0;
> > >>  }
> > >>  
> > >> +typedef struct sPAPRVFIOTable {
> > >> +    int group_id;
> > >> +    uint32_t liobn;
> > >> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> > >> +} sPAPRVFIOTable;
> > >> +
> > >> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> > >> +
> > >> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> > >> +                         uint64_t *dma32_window_start,
> > >> +                         uint64_t *dma32_window_size)
> > >> +{
> > >> +    sPAPRVFIOTable *t;
> > >> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> > >> +
> > >> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
> > >> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
> > >> +        return;
> > >> +    }
> > >> +    *dma32_window_start = info.dma32_window_start;
> > >> +    *dma32_window_size = info.dma32_window_size;
> > >> +
> > >> +    t = g_malloc0(sizeof(*t));
> > >> +    t->group_id = group_id;
> > >> +    t->liobn = liobn;
> > >> +
> > >> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> > >> +}
> > >> +
> > >> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> > >> +{
> > >> +    sPAPRVFIOTable *t;
> > >> +    struct tce_iommu_dma_map map = {
> > >> +        .argsz = sizeof(map),
> > >> +        .va = 0,
> > >> +        .dmaaddr = ioba,
> > >> +    };
> > >> +
> > >> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> > >> +        if (t->liobn != liobn) {
> > >> +            continue;
> > >> +        }
> > >> +        if (tce) {
> > >> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> > >> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
> > >> +                                       &map)) {
> > >> +                perror("TCE_MAP_DMA");
> > >> +                return H_PARAMETER;
> > >> +            }
> > >> +        } else {
> > >> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
> > >> +                                       &map)) {
> > >> +                perror("TCE_UNMAP_DMA");
> > >> +                return H_PARAMETER;
> > >> +            }
> > >> +        }
> > >> +        return H_SUCCESS;
> > >> +    }
> > >> +    return H_CONTINUE; /* positive non-zero value */
> > >> +}
> > >> +
> > >>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> > >>                                target_ulong opcode, target_ulong *args)
> > >>  {
> > >> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> > >>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> > >>  
> > >>      ret = put_tce_emu(liobn, ioba, tce);
> > >> -    if (0 >= ret) {
> > >> +    if (ret <= 0) {
> > >> +        return ret ? H_PARAMETER : H_SUCCESS;
> > >> +    }
> > >> +    ret = put_tce_vfio(liobn, ioba, tce);
> > >> +    if (ret <= 0) {
> > >>          return ret ? H_PARAMETER : H_SUCCESS;
> > >>      }
> > >>  #ifdef DEBUG_TCE
> > >> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
> > >> new file mode 100644
> > >> index 0000000..711e3e4
> > >> --- /dev/null
> > >> +++ b/hw/spapr_iommu_vfio.h
> > >> @@ -0,0 +1,49 @@
> > >> +/*
> > >> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
> > >> + * This is the copy of the kernel header.
> > >> + *
> > >> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
> > >> + *
> > >> + * This library is free software; you can redistribute it and/or
> > >> + * modify it under the terms of the GNU Lesser General Public
> > >> + * License as published by the Free Software Foundation; either
> > >> + * version 2 of the License, or (at your option) any later version.
> > >> + *
> > >> + * This library is distributed in the hope that it will be useful,
> > >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > >> + * Lesser General Public License for more details.
> > >> + *
> > >> + * You should have received a copy of the GNU Lesser General Public
> > >> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> > >> + */
> > >> +
> > >> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
> > >> +#define __HW_SPAPR_IOMMU_VFIO_H__
> > >> +
> > >> +#include "hw/linux-vfio.h"
> > >> +
> > >> +#define SPAPR_TCE_IOMMU         2
> > >> +
> > >> +struct tce_iommu_info {
> > >> +    __u32 argsz;
> > >> +    __u32 flags;
> > >> +    __u32 dma32_window_start;
> > >> +    __u32 dma32_window_size;
> > >> +    __u64 dma64_window_start;
> > >> +    __u64 dma64_window_size;
> > >> +};
> > >> +
> > >> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
> > >> +
> > >> +struct tce_iommu_dma_map {
> > >> +    __u32 argsz;
> > >> +    __u32 flags;
> > >> +    __u64 va;
> > >> +    __u64 dmaaddr;
> > >> +};
> > >> +
> > >> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
> > >> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
> > >> +
> > >> +#endif
> > >> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> > >> index 014297b..836ec4f 100644
> > >> --- a/hw/spapr_pci.c
> > >> +++ b/hw/spapr_pci.c
> > >> @@ -22,6 +22,9 @@
> > >>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> > >>   * THE SOFTWARE.
> > >>   */
> > >> +#include <sys/types.h>
> > >> +#include <dirent.h>
> > >> +
> > >>  #include "hw.h"
> > >>  #include "pci.h"
> > >>  #include "msi.h"
> > >> @@ -32,7 +35,6 @@
> > >>  #include "exec-memory.h"
> > >>  #include <libfdt.h>
> > >>  #include "trace.h"
> > >> -
> > >>  #include "hw/pci_internals.h"
> > >>  
> > >>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> > >> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
> > >>                   level);
> > >>  }
> > >>  
> > >> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> > >> +{
> > >> +    sPAPRPHBState *phb = opaque;
> > >> +    return phb->lsi_table[irq_num].dt_irq;
> > >> +}
> > >> +
> > >>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
> > >>                                unsigned size)
> > >>  {
> > >> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
> > >>      return phb->dma;
> > >>  }
> > >>  
> > >> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> > >> +{
> > >> +    char iommupath[256];
> > >> +    DIR *dirp;
> > >> +    struct dirent *entry;
> > >> +
> > >> +    if (!phb->scan) {
> > >> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
> > >> +        return 0;
> > >> +    }
> > >> +
> > >> +    snprintf(iommupath, sizeof(iommupath),
> > >> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> > >> +    dirp = opendir(iommupath);
> > >> +
> > >> +    while ((entry = readdir(dirp)) != NULL) {
> > >> +        char *tmp;
> > >> +        FILE *deviceclassfile;
> > >> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> > >> +        char addr[32];
> > >> +        DeviceState *dev;
> > >> +
> > >> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> > >> +                   &domainid, &busid, &devid, &fnid) != 4) {
> > >> +            continue;
> > >> +        }
> > >> +
> > >> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> > >> +        trace_spapr_pci("Reading device class from ", tmp);
> > >> +
> > >> +        deviceclassfile = fopen(tmp, "r");
> > >> +        if (deviceclassfile) {
> > >> +            fscanf(deviceclassfile, "%x", &deviceclass);
> > >> +            fclose(deviceclassfile);
> > >> +        }
> > >> +        g_free(tmp);
> > >> +
> > >> +        if (!deviceclass) {
> > >> +            continue;
> > >> +        }
> > >> +        if ((phb->scan < 2) &&
> > >> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
> > >> +            /* Skip _any_ bridge */
> > >> +            continue;
> > >> +        }
> > >> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> > >> +            /* Tweak USB */
> > >> +            phb->force_addr = 1;
> > >> +            phb->enable_multifunction = 1;
> > >> +        }
> > >> +
> > >> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
> > >> +
> > >> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> > >> +        if (!dev) {
> > >> +            fprintf(stderr, "failed to create vfio-pci\n");
> > >> +            continue;
> > >> +        }
> > >> +        qdev_prop_parse(dev, "host", entry->d_name);
> > >> +        if (phb->force_addr) {
> > >> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> > >> +            qdev_prop_parse(dev, "addr", addr);
> > >> +        }
> > >> +        if (phb->enable_multifunction) {
> > >> +            qdev_prop_set_bit(dev, "multifunction", 1);
> > >> +        }
> > >> +        qdev_init_nofail(dev);
> > >> +    }
> > >> +    closedir(dirp);
> > >> +
> > >> +    return 0;
> > >> +}
> > >> +
> > >>  static int spapr_phb_init(SysBusDevice *s)
> > >>  {
> > >>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> > >> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
> > >>  
> > >>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
> > >>                             phb->busname ? phb->busname : phb->dtbusname,
> > >> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> > >> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> > >> +                           pci_spapr_map_irq, phb,
> > >>                             &phb->memspace, &phb->iospace,
> > >>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
> > >>      phb->host_state.bus = bus;
> > >>  
> > >>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> > >> -    phb->dma_window_start = 0;
> > >> -    phb->dma_window_size = 0x40000000;
> > >> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
> > >>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
> > >>  
> > >>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> > >> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
> > >>          }
> > >>      }
> > >>  
> > >> +    if (phb->iommugroupid >= 0) {
> > >> +        if (spapr_pci_scan_vfio(phb) < 0) {
> > >> +            return -1;
> > >> +        }
> > >> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
> > >> +                            &phb->dma_window_start,
> > >> +                            &phb->dma_window_size);
> > >> +        return 0;
> > >> +    }
> > >> +
> > >> +    phb->dma_window_start = 0;
> > >> +    phb->dma_window_size = 0x40000000;
> > >> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> > >> +                                         phb->dma_window_size);
> > >> +
> > >>      return 0;
> > >>  }
> > >>  
> > >> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
> > >>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
> > >>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
> > >>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> > >> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> > >> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
> > >> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> > >> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
> > >>      DEFINE_PROP_END_OF_LIST(),
> > >>  };
> > >>  
> > >> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> > >> index 145071c..f514823 100644
> > >> --- a/hw/spapr_pci.h
> > >> +++ b/hw/spapr_pci.h
> > >> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
> > >>          int nvec;
> > >>      } msi_table[SPAPR_MSIX_MAX_DEVS];
> > >>  
> > >> +    int32_t iommugroupid;
> > >> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> > >> +    uint8_t enable_multifunction, force_addr;
> > >> +
> > >>      QLIST_ENTRY(sPAPRPHBState) list;
> > >>  } sPAPRPHBState;
> > >>  
> > >> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> > >> index 1ac287f..fc84fb4 100644
> > >> --- a/hw/vfio_pci.c
> > >> +++ b/hw/vfio_pci.c
> > >> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
> > >>  
> > >>          memory_listener_register(&container->listener, get_system_memory());
> > >>  
> > >> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
> > >> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> > >> +        if (ret) {
> > >> +            error_report("vfio: failed to set group container: %s\n",
> > >> +                         strerror(errno));
> > >> +            g_free(container);
> > >> +            close(fd);
> > >> +            return -1;
> > >> +        }
> > >> +
> > >> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
> > >> +        if (ret) {
> > >> +            error_report("vfio: failed to set iommu for container: %s\n",
> > >> +                         strerror(errno));
> > >> +            g_free(container);
> > >> +            close(fd);
> > >> +            return -1;
> > >> +        }
> > > 
> > > I think we can still do better.  The x86 code sets up a MemoryListener
> > > here with data for that embedded into the VFIOContainer.  You don't
> > > have, need, or want a MemoryListener, but that doesn't mean we can't
> > > follow the model of registering that this group exists here and setting
> > > up map/unmap callbacks.
> > > 
> > > For instance:
> > > 
> > > in vfio_pci.h:
> > > struct sPAPRVFIOData {
> > >     uint64_t dma32_window_start;
> > >     uint64_t dma64_window_size;
> > >     ....
> > >     int (*map)(struct tce_iommu_dma_map *);
> > >     int (*unmap)(struct tce_iommu_dma_map *);
> > > };
> > > 
> > > appended to the above spapr tce iommu setup above:
> > > 
> > > struct tce_iommu_info info;
> > > 
> > > /* the MemoryListener embedded in container becomes a union to hold
> > >  * iommu specific data. */
> > > container->u.spapr.data->map = vfio_spapr_tce_map;
> > > container->u.spapr.data->unmap = vfio_spapr_tce_unmap;
> > 
> > I could actually reuse x86 callbacks, just wanted to keep POWER ioctls together. The problem here is
> > getting the DMA window parameters and avoiding MemoryListener.
> 
> The callback is pretty trivial though, filling in the data structure and
> calling the ioctl.  We have different data structures and different
> ioctls, so probably not a lot to leverage.
> 
> > > ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))did 
> > > 
> > > container->u.spapr.data->dma32_window_start = info.dma32_window_start;
> > > container->u.spapr.data->dma32_window_size = info.dma32_window_size;
> > > 
> > > spapr_register_vfio_container(&container->u.spapr.data)
> > 
> > I assume it is called within vfio_pci.c as we do not want to access VFIOContainer members from
> > anywhere but vfio_pci.c.
> > Or we are changing the approach? I am a bit confused.
> 
> Yes, the registration function would be called from vfio_pci at the
> equivalent place in the spar iommu test as x86 is setting up the memory
> listener.  That would register the map/unmap function pointers and dma
> window information.  spapr would then make map/unmap calls using those
> function pointers, those would be implemented in vfio_pci where they
> could dereference the container and therefore get to the container fd.
> 
> > > Then vfio_disconnect_container() could call
> > > spapr_unregister_vfio_container().  Maybe the container contains a
> > > function pointer to an uninit function so we don't have to ifdef between
> > > x86 and power.  Does that make sense?  Thanks,
> > 
> > We also need to pass the numbers from the info struct to spapr_pci.c in order to tell the guest
> > where the DMA window besides. Another callback? This exactly what I avoided in the kernel when we
> > decided not to extend IOMMU API with POWER stuff, I would like to have the same here.
> 
> This in an internal API, there's no penalty for fixing it later.
> callbacks and window info are passed in the data structure outlined
> above.
> 
> > In general, what is good in pulling to VFIO as much platform specific stuff as possible?
> > 
> > I am trying to keep sPAPR IOMMU stuff away and make it easy to add new platforms to VFIO.
> > 
> > For example, I would rather think of moving the piece of code which checks for SPAPR_TCE_IOMMU out
> > of VFIO, make it a QEMUMachine callback (together with add-eoi-notifier) as the way IOMMU works is
> > definitely the specific machine type feature.
> > 
> > For example, int QEMUMachine::init_iommu(VFIOContainter *container) which would not even try
> > VFIO_TYPE1_IOMMU on POWER or SPAPR_TCE_IOMMU on x86 as it knows the machine and IOMMU types already.
> 
> I don't think tying vfio into the QEMUMachine type has a future.  If you
> convince Anthony or Michael otherwise, let me know.  Your attempt to
> keep spapr stuff completely out of vfio is requiring private interfaces
> to be exposed.  That I think is the wrong direction.
> 
> > And do something like:
> > typedef struct VFIOContainer {
> >     int fd;
> >     void *platform_iommu_data;
> > } VFIOContainer;
> > 
> > Create additional file called vfio_iommu_x86.c with:
> > struct VFIO_Type1_IOMMU {
> >     MemoryListener listener;
> >     QLIST_HEAD(, VFIOGroup) group_list;
> >     QLIST_ENTRY(VFIOContainer) next;
> > };
> > and put all MemoryListener stuff there.
> > 
> > For POWER we already have spapr_iommu.c.
> > 
> > Wrong direction? :)
> 
> This is actually very similar to what I'm proposing above.  Rather than
> a platform_iommu_data pointer, I'm suggesting that we make a union in
> VFIOContainer.  That allows us to actually dereference the container and
> get back to the fd in the callback functions.  Thanks,

Here's a stab at generalizing the way it works on x86.  The commit
comment outlines how I think it should be extended.
https://github.com/awilliam/qemu-vfio/commit/96e6f8446fc10a8f9d0506df8230218799de31ae
Thanks,
Alex
Alexey Kardashevskiy - July 17, 2012, 7:53 a.m.
On 17/07/12 00:21, Alex Williamson wrote:
> On Sat, 2012-07-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
>> On 14/07/12 01:07, Alex Williamson wrote:
>>> On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote:
>>>> It literally does the following:
>>>>
>>>> 1. POWERPC IOMMU support (the kernel counterpart is required)
>>>>
>>>> 2. The patch assumes that IOAPIC calls are going to be replaced
>>>> with something generic.
>>>>
>>>> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
>>>> handler to call VFIO IOMMU driver.
>>>>
>>>> 4. Change sPAPR PHB to scan the PCI bus which is used for
>>>> the IOMMU-VFIO group. Now it is enough to add the following to
>>>> the QEMU command line to get VFIO up with all the devices from
>>>> IOMMU group with id=3:
>>>> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
>>>> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>  hw/ppc/Makefile.objs  |    3 ++
>>>>  hw/spapr.h            |    4 ++
>>>>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
>>>>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
>>>>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
>>>>  hw/spapr_pci.h        |    4 ++
>>>>  hw/vfio_pci.c         |   30 ++++++++++++++
>>>>  hw/vfio_pci.h         |    2 +
>>>>  trace-events          |    1 +
>>>>  9 files changed, 264 insertions(+), 6 deletions(-)
>>>>  create mode 100644 hw/spapr_iommu_vfio.h
>>>>
>>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>>> index f573a95..c46a049 100644
>>>> --- a/hw/ppc/Makefile.objs
>>>> +++ b/hw/ppc/Makefile.objs
>>>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
>>>>  # Xilinx PPC peripherals
>>>>  obj-y += xilinx_ethlite.o
>>>>  
>>>> +# VFIO PCI device assignment
>>>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
>>>> +
>>>>  obj-y := $(addprefix ../,$(obj-y))
>>>> diff --git a/hw/spapr.h b/hw/spapr.h
>>>> index b37f337..26e26f6 100644
>>>> --- a/hw/spapr.h
>>>> +++ b/hw/spapr.h
>>>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
>>>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>>>                        DMAContext *dma);
>>>>  
>>>> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
>>>> +                         uint64_t *dma32_window_start,
>>>> +                         uint64_t *dma32_window_size);
>>>> +
>>>>  #endif /* !defined (__HW_SPAPR_H__) */
>>>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
>>>> index 50c288d..e48ced1 100644
>>>> --- a/hw/spapr_iommu.c
>>>> +++ b/hw/spapr_iommu.c
>>>> @@ -23,6 +23,8 @@
>>>>  #include "dma.h"
>>>>  
>>>>  #include "hw/spapr.h"
>>>> +#include "hw/spapr_iommu_vfio.h"
>>>> +#include "hw/vfio_pci.h"
>>>>  
>>>>  #include <libfdt.h>
>>>>  
>>>> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
>>>>      return 0;
>>>>  }
>>>>  
>>>> +typedef struct sPAPRVFIOTable {
>>>> +    int group_id;
>>>> +    uint32_t liobn;
>>>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
>>>> +} sPAPRVFIOTable;
>>>> +
>>>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
>>>> +
>>>> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
>>>> +                         uint64_t *dma32_window_start,
>>>> +                         uint64_t *dma32_window_size)
>>>> +{
>>>> +    sPAPRVFIOTable *t;
>>>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
>>>> +
>>>> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
>>>> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
>>>> +        return;
>>>> +    }
>>>> +    *dma32_window_start = info.dma32_window_start;
>>>> +    *dma32_window_size = info.dma32_window_size;
>>>> +
>>>> +    t = g_malloc0(sizeof(*t));
>>>> +    t->group_id = group_id;
>>>> +    t->liobn = liobn;
>>>> +
>>>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
>>>> +}
>>>> +
>>>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
>>>> +{
>>>> +    sPAPRVFIOTable *t;
>>>> +    struct tce_iommu_dma_map map = {
>>>> +        .argsz = sizeof(map),
>>>> +        .va = 0,
>>>> +        .dmaaddr = ioba,
>>>> +    };
>>>> +
>>>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
>>>> +        if (t->liobn != liobn) {
>>>> +            continue;
>>>> +        }
>>>> +        if (tce) {
>>>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
>>>> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
>>>> +                                       &map)) {
>>>> +                perror("TCE_MAP_DMA");
>>>> +                return H_PARAMETER;
>>>> +            }
>>>> +        } else {
>>>> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
>>>> +                                       &map)) {
>>>> +                perror("TCE_UNMAP_DMA");
>>>> +                return H_PARAMETER;
>>>> +            }
>>>> +        }
>>>> +        return H_SUCCESS;
>>>> +    }
>>>> +    return H_CONTINUE; /* positive non-zero value */
>>>> +}
>>>> +
>>>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>>>                                target_ulong opcode, target_ulong *args)
>>>>  {
>>>> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
>>>>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
>>>>  
>>>>      ret = put_tce_emu(liobn, ioba, tce);
>>>> -    if (0 >= ret) {
>>>> +    if (ret <= 0) {
>>>> +        return ret ? H_PARAMETER : H_SUCCESS;
>>>> +    }
>>>> +    ret = put_tce_vfio(liobn, ioba, tce);
>>>> +    if (ret <= 0) {
>>>>          return ret ? H_PARAMETER : H_SUCCESS;
>>>>      }
>>>>  #ifdef DEBUG_TCE
>>>> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
>>>> new file mode 100644
>>>> index 0000000..711e3e4
>>>> --- /dev/null
>>>> +++ b/hw/spapr_iommu_vfio.h
>>>> @@ -0,0 +1,49 @@
>>>> +/*
>>>> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
>>>> + * This is the copy of the kernel header.
>>>> + *
>>>> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
>>>> + *
>>>> + * This library is free software; you can redistribute it and/or
>>>> + * modify it under the terms of the GNU Lesser General Public
>>>> + * License as published by the Free Software Foundation; either
>>>> + * version 2 of the License, or (at your option) any later version.
>>>> + *
>>>> + * This library is distributed in the hope that it will be useful,
>>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>>> + * Lesser General Public License for more details.
>>>> + *
>>>> + * You should have received a copy of the GNU Lesser General Public
>>>> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
>>>> +#define __HW_SPAPR_IOMMU_VFIO_H__
>>>> +
>>>> +#include "hw/linux-vfio.h"
>>>> +
>>>> +#define SPAPR_TCE_IOMMU         2
>>>> +
>>>> +struct tce_iommu_info {
>>>> +    __u32 argsz;
>>>> +    __u32 flags;
>>>> +    __u32 dma32_window_start;
>>>> +    __u32 dma32_window_size;
>>>> +    __u64 dma64_window_start;
>>>> +    __u64 dma64_window_size;
>>>> +};
>>>> +
>>>> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
>>>> +
>>>> +struct tce_iommu_dma_map {
>>>> +    __u32 argsz;
>>>> +    __u32 flags;
>>>> +    __u64 va;
>>>> +    __u64 dmaaddr;
>>>> +};
>>>> +
>>>> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
>>>> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
>>>> +
>>>> +#endif
>>>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>>>> index 014297b..836ec4f 100644
>>>> --- a/hw/spapr_pci.c
>>>> +++ b/hw/spapr_pci.c
>>>> @@ -22,6 +22,9 @@
>>>>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>>>   * THE SOFTWARE.
>>>>   */
>>>> +#include <sys/types.h>
>>>> +#include <dirent.h>
>>>> +
>>>>  #include "hw.h"
>>>>  #include "pci.h"
>>>>  #include "msi.h"
>>>> @@ -32,7 +35,6 @@
>>>>  #include "exec-memory.h"
>>>>  #include <libfdt.h>
>>>>  #include "trace.h"
>>>> -
>>>>  #include "hw/pci_internals.h"
>>>>  
>>>>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
>>>> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
>>>>                   level);
>>>>  }
>>>>  
>>>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
>>>> +{
>>>> +    sPAPRPHBState *phb = opaque;
>>>> +    return phb->lsi_table[irq_num].dt_irq;
>>>> +}
>>>> +
>>>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
>>>>                                unsigned size)
>>>>  {
>>>> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>>>>      return phb->dma;
>>>>  }
>>>>  
>>>> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
>>>> +{
>>>> +    char iommupath[256];
>>>> +    DIR *dirp;
>>>> +    struct dirent *entry;
>>>> +
>>>> +    if (!phb->scan) {
>>>> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    snprintf(iommupath, sizeof(iommupath),
>>>> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
>>>> +    dirp = opendir(iommupath);
>>>> +
>>>> +    while ((entry = readdir(dirp)) != NULL) {
>>>> +        char *tmp;
>>>> +        FILE *deviceclassfile;
>>>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
>>>> +        char addr[32];
>>>> +        DeviceState *dev;
>>>> +
>>>> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
>>>> +                   &domainid, &busid, &devid, &fnid) != 4) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
>>>> +        trace_spapr_pci("Reading device class from ", tmp);
>>>> +
>>>> +        deviceclassfile = fopen(tmp, "r");
>>>> +        if (deviceclassfile) {
>>>> +            fscanf(deviceclassfile, "%x", &deviceclass);
>>>> +            fclose(deviceclassfile);
>>>> +        }
>>>> +        g_free(tmp);
>>>> +
>>>> +        if (!deviceclass) {
>>>> +            continue;
>>>> +        }
>>>> +        if ((phb->scan < 2) &&
>>>> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
>>>> +            /* Skip _any_ bridge */
>>>> +            continue;
>>>> +        }
>>>> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
>>>> +            /* Tweak USB */
>>>> +            phb->force_addr = 1;
>>>> +            phb->enable_multifunction = 1;
>>>> +        }
>>>> +
>>>> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
>>>> +
>>>> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
>>>> +        if (!dev) {
>>>> +            fprintf(stderr, "failed to create vfio-pci\n");
>>>> +            continue;
>>>> +        }
>>>> +        qdev_prop_parse(dev, "host", entry->d_name);
>>>> +        if (phb->force_addr) {
>>>> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
>>>> +            qdev_prop_parse(dev, "addr", addr);
>>>> +        }
>>>> +        if (phb->enable_multifunction) {
>>>> +            qdev_prop_set_bit(dev, "multifunction", 1);
>>>> +        }
>>>> +        qdev_init_nofail(dev);
>>>> +    }
>>>> +    closedir(dirp);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>>  static int spapr_phb_init(SysBusDevice *s)
>>>>  {
>>>>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
>>>> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
>>>>  
>>>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
>>>>                             phb->busname ? phb->busname : phb->dtbusname,
>>>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
>>>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
>>>> +                           pci_spapr_map_irq, phb,
>>>>                             &phb->memspace, &phb->iospace,
>>>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
>>>>      phb->host_state.bus = bus;
>>>>  
>>>>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
>>>> -    phb->dma_window_start = 0;
>>>> -    phb->dma_window_size = 0x40000000;
>>>> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
>>>>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
>>>>  
>>>>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
>>>> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
>>>>          }
>>>>      }
>>>>  
>>>> +    if (phb->iommugroupid >= 0) {
>>>> +        if (spapr_pci_scan_vfio(phb) < 0) {
>>>> +            return -1;
>>>> +        }
>>>> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
>>>> +                            &phb->dma_window_start,
>>>> +                            &phb->dma_window_size);
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    phb->dma_window_start = 0;
>>>> +    phb->dma_window_size = 0x40000000;
>>>> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
>>>> +                                         phb->dma_window_size);
>>>> +
>>>>      return 0;
>>>>  }
>>>>  
>>>> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
>>>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
>>>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
>>>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
>>>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
>>>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
>>>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
>>>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
>>>>      DEFINE_PROP_END_OF_LIST(),
>>>>  };
>>>>  
>>>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
>>>> index 145071c..f514823 100644
>>>> --- a/hw/spapr_pci.h
>>>> +++ b/hw/spapr_pci.h
>>>> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
>>>>          int nvec;
>>>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
>>>>  
>>>> +    int32_t iommugroupid;
>>>> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
>>>> +    uint8_t enable_multifunction, force_addr;
>>>> +
>>>>      QLIST_ENTRY(sPAPRPHBState) list;
>>>>  } sPAPRPHBState;
>>>>  
>>>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
>>>> index 1ac287f..fc84fb4 100644
>>>> --- a/hw/vfio_pci.c
>>>> +++ b/hw/vfio_pci.c
>>>> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
>>>>  
>>>>          memory_listener_register(&container->listener, get_system_memory());
>>>>  
>>>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
>>>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
>>>> +        if (ret) {
>>>> +            error_report("vfio: failed to set group container: %s\n",
>>>> +                         strerror(errno));
>>>> +            g_free(container);
>>>> +            close(fd);
>>>> +            return -1;
>>>> +        }
>>>> +
>>>> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
>>>> +        if (ret) {
>>>> +            error_report("vfio: failed to set iommu for container: %s\n",
>>>> +                         strerror(errno));
>>>> +            g_free(container);
>>>> +            close(fd);
>>>> +            return -1;
>>>> +        }
>>>
>>> I think we can still do better.  The x86 code sets up a MemoryListener
>>> here with data for that embedded into the VFIOContainer.  You don't
>>> have, need, or want a MemoryListener, but that doesn't mean we can't
>>> follow the model of registering that this group exists here and setting
>>> up map/unmap callbacks.
>>>
>>> For instance:
>>>
>>> in vfio_pci.h:
>>> struct sPAPRVFIOData {
>>>     uint64_t dma32_window_start;
>>>     uint64_t dma64_window_size;
>>>     ....
>>>     int (*map)(struct tce_iommu_dma_map *);
>>>     int (*unmap)(struct tce_iommu_dma_map *);
>>> };

>>> appended to the above spapr tce iommu setup above:
>>>
>>> struct tce_iommu_info info;
>>>
>>> /* the MemoryListener embedded in container becomes a union to hold
>>>  * iommu specific data. */
>>> container->u.spapr.data->map = vfio_spapr_tce_map;
>>> container->u.spapr.data->unmap = vfio_spapr_tce_unmap;
>>
>> I could actually reuse x86 callbacks, just wanted to keep POWER ioctls together. The problem here is
>> getting the DMA window parameters and avoiding MemoryListener.
> 
> The callback is pretty trivial though, filling in the data structure and
> calling the ioctl.  We have different data structures and different
> ioctls, so probably not a lot to leverage.
> 
>>> ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))did 
>>>
>>> container->u.spapr.data->dma32_window_start = info.dma32_window_start;
>>> container->u.spapr.data->dma32_window_size = info.dma32_window_size;
>>>
>>> spapr_register_vfio_container(&container->u.spapr.data)
>>
>> I assume it is called within vfio_pci.c as we do not want to access VFIOContainer members from
>> anywhere but vfio_pci.c.
>> Or we are changing the approach? I am a bit confused.
> 
> Yes, the registration function would be called from vfio_pci at the
> equivalent place in the spar iommu test as x86 is setting up the memory
> listener.  That would register the map/unmap function pointers and dma
> window information.  spapr would then make map/unmap calls using those
> function pointers, those would be implemented in vfio_pci where they
> could dereference the container and therefore get to the container fd.


How do we match this data with the PCI bus or device?

Even if we add IOMMU ID to sPAPRVFIOData and map/unmap callbacks, we still do not know which PCI bus it corresponds to if we create devices as it is done on x86. sPAPR PHB does not have an IOMMU id/fd and cannot get it from VFIO as it would be "exposing a private interface".

So I will have to specify an IOMMU id for the PCI bus from the command line.

And I really want to be sure that spapr_register_vfio_container() is called before spapr_pci.c started populating the device tree with the DMA window parameters which is done in spapr_reset() (a reset function of sPAPR PHB) now. Ideally I would like to know where the window is even before in order to initialize DMAContext, so keeping everything in one function spapr_phb_init() seems very right for me.


>>> Then vfio_disconnect_container() could call
>>> spapr_unregister_vfio_container().  Maybe the container contains a
>>> function pointer to an uninit function so we don't have to ifdef between
>>> x86 and power.  Does that make sense?  Thanks,
>>
>> We also need to pass the numbers from the info struct to spapr_pci.c in order to tell the guest
>> where the DMA window besides. Another callback? This exactly what I avoided in the kernel when we
>> decided not to extend IOMMU API with POWER stuff, I would like to have the same here.
> 
> This in an internal API, there's no penalty for fixing it later.
> callbacks and window info are passed in the data structure outlined
> above.
> 
>> In general, what is good in pulling to VFIO as much platform specific stuff as possible?
>>
>> I am trying to keep sPAPR IOMMU stuff away and make it easy to add new platforms to VFIO.
>>
>> For example, I would rather think of moving the piece of code which checks for SPAPR_TCE_IOMMU out
>> of VFIO, make it a QEMUMachine callback (together with add-eoi-notifier) as the way IOMMU works is
>> definitely the specific machine type feature.
>>
>> For example, int QEMUMachine::init_iommu(VFIOContainter *container) which would not even try
>> VFIO_TYPE1_IOMMU on POWER or SPAPR_TCE_IOMMU on x86 as it knows the machine and IOMMU types already.
> 
> I don't think tying vfio into the QEMUMachine type has a future.  If you
> convince Anthony or Michael otherwise, let me know.  


This is mostly because it has "vfio" in its name.
If it was something generic like IOMMU-via-fd with no mention of VFIO, then it would have got a chance :)
Seriously, it already has a set of various unrelated flags, I do not see why not to add something what really belongs to it.


> Your attempt to
> keep spapr stuff completely out of vfio is requiring private interfaces
> to be exposed.  That I think is the wrong direction.


int vfio_group_iommu_ioctl(int iommu_group, int request, void *data)
does not expose anything private from VFIO. IOMMU id is not any kind of private data. And struct tce_iommu_info is not a data which VFIO really wants to know. The calling code (spapr_pci.c) should know the IOMMU id either way and only it knows how map/unmap works, lets keep it there.

I would only add to VFIO API this:
int vfio_group_iommu_init(int iommu_group)
to make a group initialization explicit.

What is wrong with such a solution?


>> And do something like:
>> typedef struct VFIOContainer {
>>     int fd;
>>     void *platform_iommu_data;
>> } VFIOContainer;
>>
>> Create additional file called vfio_iommu_x86.c with:
>> struct VFIO_Type1_IOMMU {
>>     MemoryListener listener;
>>     QLIST_HEAD(, VFIOGroup) group_list;
>>     QLIST_ENTRY(VFIOContainer) next;
>> };
>> and put all MemoryListener stuff there.
>>
>> For POWER we already have spapr_iommu.c.
>>
>> Wrong direction? :)
> 
> This is actually very similar to what I'm proposing above.  Rather than
> a platform_iommu_data pointer, I'm suggesting that we make a union in
> VFIOContainer.  That allows us to actually dereference the container and
> get back to the fd in the callback functions.  Thanks,
Alex Williamson - July 17, 2012, 2:11 p.m.
On Tue, 2012-07-17 at 17:53 +1000, Alexey Kardashevskiy wrote:
> On 17/07/12 00:21, Alex Williamson wrote:
> > On Sat, 2012-07-14 at 12:34 +1000, Alexey Kardashevskiy wrote:
> >> On 14/07/12 01:07, Alex Williamson wrote:
> >>> On Fri, 2012-07-13 at 17:26 +1000, Alexey Kardashevskiy wrote:
> >>>> It literally does the following:
> >>>>
> >>>> 1. POWERPC IOMMU support (the kernel counterpart is required)
> >>>>
> >>>> 2. The patch assumes that IOAPIC calls are going to be replaced
> >>>> with something generic.
> >>>>
> >>>> 3. vfio_group_iommu_ioctl() has been added to let sPAPR IOMMU
> >>>> handler to call VFIO IOMMU driver.
> >>>>
> >>>> 4. Change sPAPR PHB to scan the PCI bus which is used for
> >>>> the IOMMU-VFIO group. Now it is enough to add the following to
> >>>> the QEMU command line to get VFIO up with all the devices from
> >>>> IOMMU group with id=3:
> >>>> -device spapr-pci-host-bridge,busname=E1000E,buid=0x3,iommu=3,\
> >>>> mem_win_addr=0x230000000000,io_win_addr=0x240000000000,msi_win_addr=0x250000000000
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>>  hw/ppc/Makefile.objs  |    3 ++
> >>>>  hw/spapr.h            |    4 ++
> >>>>  hw/spapr_iommu.c      |   69 ++++++++++++++++++++++++++++++-
> >>>>  hw/spapr_iommu_vfio.h |   49 ++++++++++++++++++++++
> >>>>  hw/spapr_pci.c        |  108 ++++++++++++++++++++++++++++++++++++++++++++++---
> >>>>  hw/spapr_pci.h        |    4 ++
> >>>>  hw/vfio_pci.c         |   30 ++++++++++++++
> >>>>  hw/vfio_pci.h         |    2 +
> >>>>  trace-events          |    1 +
> >>>>  9 files changed, 264 insertions(+), 6 deletions(-)
> >>>>  create mode 100644 hw/spapr_iommu_vfio.h
> >>>>
> >>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> >>>> index f573a95..c46a049 100644
> >>>> --- a/hw/ppc/Makefile.objs
> >>>> +++ b/hw/ppc/Makefile.objs
> >>>> @@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
> >>>>  # Xilinx PPC peripherals
> >>>>  obj-y += xilinx_ethlite.o
> >>>>  
> >>>> +# VFIO PCI device assignment
> >>>> +obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
> >>>> +
> >>>>  obj-y := $(addprefix ../,$(obj-y))
> >>>> diff --git a/hw/spapr.h b/hw/spapr.h
> >>>> index b37f337..26e26f6 100644
> >>>> --- a/hw/spapr.h
> >>>> +++ b/hw/spapr.h
> >>>> @@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> >>>>  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
> >>>>                        DMAContext *dma);
> >>>>  
> >>>> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> >>>> +                         uint64_t *dma32_window_start,
> >>>> +                         uint64_t *dma32_window_size);
> >>>> +
> >>>>  #endif /* !defined (__HW_SPAPR_H__) */
> >>>> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> >>>> index 50c288d..e48ced1 100644
> >>>> --- a/hw/spapr_iommu.c
> >>>> +++ b/hw/spapr_iommu.c
> >>>> @@ -23,6 +23,8 @@
> >>>>  #include "dma.h"
> >>>>  
> >>>>  #include "hw/spapr.h"
> >>>> +#include "hw/spapr_iommu_vfio.h"
> >>>> +#include "hw/vfio_pci.h"
> >>>>  
> >>>>  #include <libfdt.h>
> >>>>  
> >>>> @@ -183,6 +185,67 @@ static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
> >>>>      return 0;
> >>>>  }
> >>>>  
> >>>> +typedef struct sPAPRVFIOTable {
> >>>> +    int group_id;
> >>>> +    uint32_t liobn;
> >>>> +    QLIST_ENTRY(sPAPRVFIOTable) list;
> >>>> +} sPAPRVFIOTable;
> >>>> +
> >>>> +QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
> >>>> +
> >>>> +void spapr_vfio_init_dma(int group_id, uint32_t liobn,
> >>>> +                         uint64_t *dma32_window_start,
> >>>> +                         uint64_t *dma32_window_size)
> >>>> +{
> >>>> +    sPAPRVFIOTable *t;
> >>>> +    struct tce_iommu_info info = { .argsz = sizeof(info) };
> >>>> +
> >>>> +    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
> >>>> +        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
> >>>> +        return;
> >>>> +    }
> >>>> +    *dma32_window_start = info.dma32_window_start;
> >>>> +    *dma32_window_size = info.dma32_window_size;
> >>>> +
> >>>> +    t = g_malloc0(sizeof(*t));
> >>>> +    t->group_id = group_id;
> >>>> +    t->liobn = liobn;
> >>>> +
> >>>> +    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
> >>>> +}
> >>>> +
> >>>> +static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
> >>>> +{
> >>>> +    sPAPRVFIOTable *t;
> >>>> +    struct tce_iommu_dma_map map = {
> >>>> +        .argsz = sizeof(map),
> >>>> +        .va = 0,
> >>>> +        .dmaaddr = ioba,
> >>>> +    };
> >>>> +
> >>>> +    QLIST_FOREACH(t, &vfio_tce_tables, list) {
> >>>> +        if (t->liobn != liobn) {
> >>>> +            continue;
> >>>> +        }
> >>>> +        if (tce) {
> >>>> +            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
> >>>> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
> >>>> +                                       &map)) {
> >>>> +                perror("TCE_MAP_DMA");
> >>>> +                return H_PARAMETER;
> >>>> +            }
> >>>> +        } else {
> >>>> +            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
> >>>> +                                       &map)) {
> >>>> +                perror("TCE_UNMAP_DMA");
> >>>> +                return H_PARAMETER;
> >>>> +            }
> >>>> +        }
> >>>> +        return H_SUCCESS;
> >>>> +    }
> >>>> +    return H_CONTINUE; /* positive non-zero value */
> >>>> +}
> >>>> +
> >>>>  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> >>>>                                target_ulong opcode, target_ulong *args)
> >>>>  {
> >>>> @@ -200,7 +263,11 @@ static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
> >>>>      ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
> >>>>  
> >>>>      ret = put_tce_emu(liobn, ioba, tce);
> >>>> -    if (0 >= ret) {
> >>>> +    if (ret <= 0) {
> >>>> +        return ret ? H_PARAMETER : H_SUCCESS;
> >>>> +    }
> >>>> +    ret = put_tce_vfio(liobn, ioba, tce);
> >>>> +    if (ret <= 0) {
> >>>>          return ret ? H_PARAMETER : H_SUCCESS;
> >>>>      }
> >>>>  #ifdef DEBUG_TCE
> >>>> diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
> >>>> new file mode 100644
> >>>> index 0000000..711e3e4
> >>>> --- /dev/null
> >>>> +++ b/hw/spapr_iommu_vfio.h
> >>>> @@ -0,0 +1,49 @@
> >>>> +/*
> >>>> + * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
> >>>> + * This is the copy of the kernel header.
> >>>> + *
> >>>> + * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
> >>>> + *
> >>>> + * This library is free software; you can redistribute it and/or
> >>>> + * modify it under the terms of the GNU Lesser General Public
> >>>> + * License as published by the Free Software Foundation; either
> >>>> + * version 2 of the License, or (at your option) any later version.
> >>>> + *
> >>>> + * This library is distributed in the hope that it will be useful,
> >>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >>>> + * Lesser General Public License for more details.
> >>>> + *
> >>>> + * You should have received a copy of the GNU Lesser General Public
> >>>> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> >>>> + */
> >>>> +
> >>>> +#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
> >>>> +#define __HW_SPAPR_IOMMU_VFIO_H__
> >>>> +
> >>>> +#include "hw/linux-vfio.h"
> >>>> +
> >>>> +#define SPAPR_TCE_IOMMU         2
> >>>> +
> >>>> +struct tce_iommu_info {
> >>>> +    __u32 argsz;
> >>>> +    __u32 flags;
> >>>> +    __u32 dma32_window_start;
> >>>> +    __u32 dma32_window_size;
> >>>> +    __u64 dma64_window_start;
> >>>> +    __u64 dma64_window_size;
> >>>> +};
> >>>> +
> >>>> +#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
> >>>> +
> >>>> +struct tce_iommu_dma_map {
> >>>> +    __u32 argsz;
> >>>> +    __u32 flags;
> >>>> +    __u64 va;
> >>>> +    __u64 dmaaddr;
> >>>> +};
> >>>> +
> >>>> +#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
> >>>> +#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
> >>>> +
> >>>> +#endif
> >>>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> >>>> index 014297b..836ec4f 100644
> >>>> --- a/hw/spapr_pci.c
> >>>> +++ b/hw/spapr_pci.c
> >>>> @@ -22,6 +22,9 @@
> >>>>   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >>>>   * THE SOFTWARE.
> >>>>   */
> >>>> +#include <sys/types.h>
> >>>> +#include <dirent.h>
> >>>> +
> >>>>  #include "hw.h"
> >>>>  #include "pci.h"
> >>>>  #include "msi.h"
> >>>> @@ -32,7 +35,6 @@
> >>>>  #include "exec-memory.h"
> >>>>  #include <libfdt.h>
> >>>>  #include "trace.h"
> >>>> -
> >>>>  #include "hw/pci_internals.h"
> >>>>  
> >>>>  /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
> >>>> @@ -440,6 +442,12 @@ static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
> >>>>                   level);
> >>>>  }
> >>>>  
> >>>> +static int pci_spapr_get_irq(void *opaque, int irq_num)
> >>>> +{
> >>>> +    sPAPRPHBState *phb = opaque;
> >>>> +    return phb->lsi_table[irq_num].dt_irq;
> >>>> +}
> >>>> +
> >>>>  static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
> >>>>                                unsigned size)
> >>>>  {
> >>>> @@ -515,6 +523,79 @@ static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
> >>>>      return phb->dma;
> >>>>  }
> >>>>  
> >>>> +static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
> >>>> +{
> >>>> +    char iommupath[256];
> >>>> +    DIR *dirp;
> >>>> +    struct dirent *entry;
> >>>> +
> >>>> +    if (!phb->scan) {
> >>>> +        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
> >>>> +        return 0;
> >>>> +    }
> >>>> +
> >>>> +    snprintf(iommupath, sizeof(iommupath),
> >>>> +             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
> >>>> +    dirp = opendir(iommupath);
> >>>> +
> >>>> +    while ((entry = readdir(dirp)) != NULL) {
> >>>> +        char *tmp;
> >>>> +        FILE *deviceclassfile;
> >>>> +        unsigned deviceclass = 0, domainid, busid, devid, fnid;
> >>>> +        char addr[32];
> >>>> +        DeviceState *dev;
> >>>> +
> >>>> +        if (sscanf(entry->d_name, "%X:%X:%X.%x",
> >>>> +                   &domainid, &busid, &devid, &fnid) != 4) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
> >>>> +        trace_spapr_pci("Reading device class from ", tmp);
> >>>> +
> >>>> +        deviceclassfile = fopen(tmp, "r");
> >>>> +        if (deviceclassfile) {
> >>>> +            fscanf(deviceclassfile, "%x", &deviceclass);
> >>>> +            fclose(deviceclassfile);
> >>>> +        }
> >>>> +        g_free(tmp);
> >>>> +
> >>>> +        if (!deviceclass) {
> >>>> +            continue;
> >>>> +        }
> >>>> +        if ((phb->scan < 2) &&
> >>>> +            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
> >>>> +            /* Skip _any_ bridge */
> >>>> +            continue;
> >>>> +        }
> >>>> +        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
> >>>> +            /* Tweak USB */
> >>>> +            phb->force_addr = 1;
> >>>> +            phb->enable_multifunction = 1;
> >>>> +        }
> >>>> +
> >>>> +        trace_spapr_pci("Creating devicei from ", entry->d_name);
> >>>> +
> >>>> +        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
> >>>> +        if (!dev) {
> >>>> +            fprintf(stderr, "failed to create vfio-pci\n");
> >>>> +            continue;
> >>>> +        }
> >>>> +        qdev_prop_parse(dev, "host", entry->d_name);
> >>>> +        if (phb->force_addr) {
> >>>> +            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
> >>>> +            qdev_prop_parse(dev, "addr", addr);
> >>>> +        }
> >>>> +        if (phb->enable_multifunction) {
> >>>> +            qdev_prop_set_bit(dev, "multifunction", 1);
> >>>> +        }
> >>>> +        qdev_init_nofail(dev);
> >>>> +    }
> >>>> +    closedir(dirp);
> >>>> +
> >>>> +    return 0;
> >>>> +}
> >>>> +
> >>>>  static int spapr_phb_init(SysBusDevice *s)
> >>>>  {
> >>>>      sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
> >>>> @@ -567,15 +648,13 @@ static int spapr_phb_init(SysBusDevice *s)
> >>>>  
> >>>>      bus = pci_register_bus(&phb->host_state.busdev.qdev,
> >>>>                             phb->busname ? phb->busname : phb->dtbusname,
> >>>> -                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
> >>>> +                           pci_spapr_set_irq, pci_spapr_get_irq,
> >>>> +                           pci_spapr_map_irq, phb,
> >>>>                             &phb->memspace, &phb->iospace,
> >>>>                             PCI_DEVFN(0, 0), PCI_NUM_PINS);
> >>>>      phb->host_state.bus = bus;
> >>>>  
> >>>>      phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
> >>>> -    phb->dma_window_start = 0;
> >>>> -    phb->dma_window_size = 0x40000000;
> >>>> -    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
> >>>>      pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
> >>>>  
> >>>>      QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
> >>>> @@ -588,6 +667,21 @@ static int spapr_phb_init(SysBusDevice *s)
> >>>>          }
> >>>>      }
> >>>>  
> >>>> +    if (phb->iommugroupid >= 0) {
> >>>> +        if (spapr_pci_scan_vfio(phb) < 0) {
> >>>> +            return -1;
> >>>> +        }
> >>>> +        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
> >>>> +                            &phb->dma_window_start,
> >>>> +                            &phb->dma_window_size);
> >>>> +        return 0;
> >>>> +    }
> >>>> +
> >>>> +    phb->dma_window_start = 0;
> >>>> +    phb->dma_window_size = 0x40000000;
> >>>> +    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
> >>>> +                                         phb->dma_window_size);
> >>>> +
> >>>>      return 0;
> >>>>  }
> >>>>  
> >>>> @@ -599,6 +693,10 @@ static Property spapr_phb_properties[] = {
> >>>>      DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
> >>>>      DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
> >>>>      DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
> >>>> +    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
> >>>> +    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
> >>>> +    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
> >>>> +    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
> >>>>      DEFINE_PROP_END_OF_LIST(),
> >>>>  };
> >>>>  
> >>>> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> >>>> index 145071c..f514823 100644
> >>>> --- a/hw/spapr_pci.h
> >>>> +++ b/hw/spapr_pci.h
> >>>> @@ -57,6 +57,10 @@ typedef struct sPAPRPHBState {
> >>>>          int nvec;
> >>>>      } msi_table[SPAPR_MSIX_MAX_DEVS];
> >>>>  
> >>>> +    int32_t iommugroupid;
> >>>> +    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
> >>>> +    uint8_t enable_multifunction, force_addr;
> >>>> +
> >>>>      QLIST_ENTRY(sPAPRPHBState) list;
> >>>>  } sPAPRPHBState;
> >>>>  
> >>>> diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
> >>>> index 1ac287f..fc84fb4 100644
> >>>> --- a/hw/vfio_pci.c
> >>>> +++ b/hw/vfio_pci.c
> >>>> @@ -1581,6 +1581,24 @@ static int vfio_connect_container(VFIOGroup *group)
> >>>>  
> >>>>          memory_listener_register(&container->listener, get_system_memory());
> >>>>  
> >>>> +    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
> >>>> +        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
> >>>> +        if (ret) {
> >>>> +            error_report("vfio: failed to set group container: %s\n",
> >>>> +                         strerror(errno));
> >>>> +            g_free(container);
> >>>> +            close(fd);
> >>>> +            return -1;
> >>>> +        }
> >>>> +
> >>>> +        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
> >>>> +        if (ret) {
> >>>> +            error_report("vfio: failed to set iommu for container: %s\n",
> >>>> +                         strerror(errno));
> >>>> +            g_free(container);
> >>>> +            close(fd);
> >>>> +            return -1;
> >>>> +        }
> >>>
> >>> I think we can still do better.  The x86 code sets up a MemoryListener
> >>> here with data for that embedded into the VFIOContainer.  You don't
> >>> have, need, or want a MemoryListener, but that doesn't mean we can't
> >>> follow the model of registering that this group exists here and setting
> >>> up map/unmap callbacks.
> >>>
> >>> For instance:
> >>>
> >>> in vfio_pci.h:
> >>> struct sPAPRVFIOData {
> >>>     uint64_t dma32_window_start;
> >>>     uint64_t dma64_window_size;
> >>>     ....
> >>>     int (*map)(struct tce_iommu_dma_map *);
> >>>     int (*unmap)(struct tce_iommu_dma_map *);
> >>> };
> 
> >>> appended to the above spapr tce iommu setup above:
> >>>
> >>> struct tce_iommu_info info;
> >>>
> >>> /* the MemoryListener embedded in container becomes a union to hold
> >>>  * iommu specific data. */
> >>> container->u.spapr.data->map = vfio_spapr_tce_map;
> >>> container->u.spapr.data->unmap = vfio_spapr_tce_unmap;
> >>
> >> I could actually reuse x86 callbacks, just wanted to keep POWER ioctls together. The problem here is
> >> getting the DMA window parameters and avoiding MemoryListener.
> > 
> > The callback is pretty trivial though, filling in the data structure and
> > calling the ioctl.  We have different data structures and different
> > ioctls, so probably not a lot to leverage.
> > 
> >>> ioctl(fd, SPAPR_TCE_IOMMU_GET_INFO, &info))did 
> >>>
> >>> container->u.spapr.data->dma32_window_start = info.dma32_window_start;
> >>> container->u.spapr.data->dma32_window_size = info.dma32_window_size;
> >>>
> >>> spapr_register_vfio_container(&container->u.spapr.data)
> >>
> >> I assume it is called within vfio_pci.c as we do not want to access VFIOContainer members from
> >> anywhere but vfio_pci.c.
> >> Or we are changing the approach? I am a bit confused.
> > 
> > Yes, the registration function would be called from vfio_pci at the
> > equivalent place in the spar iommu test as x86 is setting up the memory
> > listener.  That would register the map/unmap function pointers and dma
> > window information.  spapr would then make map/unmap calls using those
> > function pointers, those would be implemented in vfio_pci where they
> > could dereference the container and therefore get to the container fd.
> 
> 
> How do we match this data with the PCI bus or device?

At the point where we're initializing the vfio iommu, you have a
PCIDevice, you can pass the PCIBus from that if you prefer.

> Even if we add IOMMU ID to sPAPRVFIOData and map/unmap callbacks, we
> still do not know which PCI bus it corresponds to if we create devices
> as it is done on x86. sPAPR PHB does not have an IOMMU id/fd and
> cannot get it from VFIO as it would be "exposing a private interface".
> 
> So I will have to specify an IOMMU id for the PCI bus from the command
> line.
> 
> And I really want to be sure that spapr_register_vfio_container() is
> called before spapr_pci.c started populating the device tree with the
> DMA window parameters which is done in spapr_reset() (a reset function
> of sPAPR PHB) now. Ideally I would like to know where the window is
> even before in order to initialize DMAContext, so keeping everything
> in one function spapr_phb_init() seems very right for me.

Disagree, you're limited in getting the iommu info by attaching a group
to a container.  By calling out to spapr code when that happens, you
guarantee the info is valid.  By calling into vfio, you may shift code
around and find out you're now trying to get iommu info before you have
vfio privileges to do so.

> >>> Then vfio_disconnect_container() could call
> >>> spapr_unregister_vfio_container().  Maybe the container contains a
> >>> function pointer to an uninit function so we don't have to ifdef between
> >>> x86 and power.  Does that make sense?  Thanks,
> >>
> >> We also need to pass the numbers from the info struct to spapr_pci.c in order to tell the guest
> >> where the DMA window besides. Another callback? This exactly what I avoided in the kernel when we
> >> decided not to extend IOMMU API with POWER stuff, I would like to have the same here.
> > 
> > This in an internal API, there's no penalty for fixing it later.
> > callbacks and window info are passed in the data structure outlined
> > above.
> > 
> >> In general, what is good in pulling to VFIO as much platform specific stuff as possible?
> >>
> >> I am trying to keep sPAPR IOMMU stuff away and make it easy to add new platforms to VFIO.
> >>
> >> For example, I would rather think of moving the piece of code which checks for SPAPR_TCE_IOMMU out
> >> of VFIO, make it a QEMUMachine callback (together with add-eoi-notifier) as the way IOMMU works is
> >> definitely the specific machine type feature.
> >>
> >> For example, int QEMUMachine::init_iommu(VFIOContainter *container) which would not even try
> >> VFIO_TYPE1_IOMMU on POWER or SPAPR_TCE_IOMMU on x86 as it knows the machine and IOMMU types already.
> > 
> > I don't think tying vfio into the QEMUMachine type has a future.  If you
> > convince Anthony or Michael otherwise, let me know.  
> 
> 
> This is mostly because it has "vfio" in its name.
> If it was something generic like IOMMU-via-fd with no mention of VFIO, then it would have got a chance :)
> Seriously, it already has a set of various unrelated flags, I do not see why not to add something what really belongs to it.
> 
> 
> > Your attempt to
> > keep spapr stuff completely out of vfio is requiring private interfaces
> > to be exposed.  That I think is the wrong direction.
> 
> 
> int vfio_group_iommu_ioctl(int iommu_group, int request, void *data)
> does not expose anything private from VFIO. IOMMU id is not any kind
> of private data. And struct tce_iommu_info is not a data which VFIO
> really wants to know. The calling code (spapr_pci.c) should know the
> IOMMU id either way and only it knows how map/unmap works, lets keep
> it there.

It's completely asynchronous to anything in vfio.  We can't call out to
spapr code and let it know that group went away.  We're exposing a raw
ioctl to all of qemu.  It's a pretty ugly interface (yes, I know I
suggested it).

> I would only add to VFIO API this:
> int vfio_group_iommu_init(int iommu_group)
> to make a group initialization explicit.
> 
> What is wrong with such a solution?

And when do you propose calling that?  You can *only* init the iommu
once a group is attached to a container.  This gives vfio the privilege
it needs to init the iommu.  vfio_group_iommu_init therefore logically
happens at the point when we set the iommu, which is where I'm
suggesting the callout happen, just like x86 does with the memory
listener.  Thanks,

Alex

> >> And do something like:
> >> typedef struct VFIOContainer {
> >>     int fd;
> >>     void *platform_iommu_data;
> >> } VFIOContainer;
> >>
> >> Create additional file called vfio_iommu_x86.c with:
> >> struct VFIO_Type1_IOMMU {
> >>     MemoryListener listener;
> >>     QLIST_HEAD(, VFIOGroup) group_list;
> >>     QLIST_ENTRY(VFIOContainer) next;
> >> };
> >> and put all MemoryListener stuff there.
> >>
> >> For POWER we already have spapr_iommu.c.
> >>
> >> Wrong direction? :)
> > 
> > This is actually very similar to what I'm proposing above.  Rather than
> > a platform_iommu_data pointer, I'm suggesting that we make a union in
> > VFIOContainer.  That allows us to actually dereference the container and
> > get back to the fd in the callback functions.  Thanks,
> 
>

Patch

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index f573a95..c46a049 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -25,4 +25,7 @@  obj-$(CONFIG_FDT) += ../device_tree.o
 # Xilinx PPC peripherals
 obj-y += xilinx_ethlite.o
 
+# VFIO PCI device assignment
+obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
+
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/spapr.h b/hw/spapr.h
index b37f337..26e26f6 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -340,4 +340,8 @@  int spapr_dma_dt(void *fdt, int node_off, const char *propname,
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                       DMAContext *dma);
 
+void spapr_vfio_init_dma(int group_id, uint32_t liobn,
+                         uint64_t *dma32_window_start,
+                         uint64_t *dma32_window_size);
+
 #endif /* !defined (__HW_SPAPR_H__) */
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 50c288d..e48ced1 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -23,6 +23,8 @@ 
 #include "dma.h"
 
 #include "hw/spapr.h"
+#include "hw/spapr_iommu_vfio.h"
+#include "hw/vfio_pci.h"
 
 #include <libfdt.h>
 
@@ -183,6 +185,67 @@  static int put_tce_emu(target_ulong liobn, target_ulong ioba, target_ulong tce)
     return 0;
 }
 
+typedef struct sPAPRVFIOTable {
+    int group_id;
+    uint32_t liobn;
+    QLIST_ENTRY(sPAPRVFIOTable) list;
+} sPAPRVFIOTable;
+
+QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
+
+void spapr_vfio_init_dma(int group_id, uint32_t liobn,
+                         uint64_t *dma32_window_start,
+                         uint64_t *dma32_window_size)
+{
+    sPAPRVFIOTable *t;
+    struct tce_iommu_info info = { .argsz = sizeof(info) };
+
+    if (vfio_group_iommu_ioctl(group_id, SPAPR_TCE_IOMMU_GET_INFO, &info)) {
+        perror("SPAPR_TCE_IOMMU_GET_INFO failed");
+        return;
+    }
+    *dma32_window_start = info.dma32_window_start;
+    *dma32_window_size = info.dma32_window_size;
+
+    t = g_malloc0(sizeof(*t));
+    t->group_id = group_id;
+    t->liobn = liobn;
+
+    QLIST_INSERT_HEAD(&vfio_tce_tables, t, list);
+}
+
+static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
+{
+    sPAPRVFIOTable *t;
+    struct tce_iommu_dma_map map = {
+        .argsz = sizeof(map),
+        .va = 0,
+        .dmaaddr = ioba,
+    };
+
+    QLIST_FOREACH(t, &vfio_tce_tables, list) {
+        if (t->liobn != liobn) {
+            continue;
+        }
+        if (tce) {
+            map.va = (uintptr_t)qemu_get_ram_ptr(tce & ~SPAPR_TCE_PAGE_MASK);
+            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_MAP_DMA,
+                                       &map)) {
+                perror("TCE_MAP_DMA");
+                return H_PARAMETER;
+            }
+        } else {
+            if (vfio_group_iommu_ioctl(t->group_id, SPAPR_TCE_IOMMU_UNMAP_DMA,
+                                       &map)) {
+                perror("TCE_UNMAP_DMA");
+                return H_PARAMETER;
+            }
+        }
+        return H_SUCCESS;
+    }
+    return H_CONTINUE; /* positive non-zero value */
+}
+
 static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
                               target_ulong opcode, target_ulong *args)
 {
@@ -200,7 +263,11 @@  static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
     ioba &= ~(SPAPR_TCE_PAGE_SIZE - 1);
 
     ret = put_tce_emu(liobn, ioba, tce);
-    if (0 >= ret) {
+    if (ret <= 0) {
+        return ret ? H_PARAMETER : H_SUCCESS;
+    }
+    ret = put_tce_vfio(liobn, ioba, tce);
+    if (ret <= 0) {
         return ret ? H_PARAMETER : H_SUCCESS;
     }
 #ifdef DEBUG_TCE
diff --git a/hw/spapr_iommu_vfio.h b/hw/spapr_iommu_vfio.h
new file mode 100644
index 0000000..711e3e4
--- /dev/null
+++ b/hw/spapr_iommu_vfio.h
@@ -0,0 +1,49 @@ 
+/*
+ * Definitions for VFIO IOMMU driver implementing SPAPR TCE.
+ * This is the copy of the kernel header.
+ *
+ * Copyright (c) 2012 Alexey Kardashevskiy <aik@olabs.ru>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if !defined(__HW_SPAPR_IOMMU_VFIO_H__)
+#define __HW_SPAPR_IOMMU_VFIO_H__
+
+#include "hw/linux-vfio.h"
+
+#define SPAPR_TCE_IOMMU         2
+
+struct tce_iommu_info {
+    __u32 argsz;
+    __u32 flags;
+    __u32 dma32_window_start;
+    __u32 dma32_window_size;
+    __u64 dma64_window_start;
+    __u64 dma64_window_size;
+};
+
+#define SPAPR_TCE_IOMMU_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct tce_iommu_dma_map {
+    __u32 argsz;
+    __u32 flags;
+    __u64 va;
+    __u64 dmaaddr;
+};
+
+#define SPAPR_TCE_IOMMU_MAP_DMA         _IO(VFIO_TYPE, VFIO_BASE + 13)
+#define SPAPR_TCE_IOMMU_UNMAP_DMA       _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+#endif
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 014297b..836ec4f 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -22,6 +22,9 @@ 
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <sys/types.h>
+#include <dirent.h>
+
 #include "hw.h"
 #include "pci.h"
 #include "msi.h"
@@ -32,7 +35,6 @@ 
 #include "exec-memory.h"
 #include <libfdt.h>
 #include "trace.h"
-
 #include "hw/pci_internals.h"
 
 /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */
@@ -440,6 +442,12 @@  static void pci_spapr_set_irq(void *opaque, int irq_num, int level)
                  level);
 }
 
+static int pci_spapr_get_irq(void *opaque, int irq_num)
+{
+    sPAPRPHBState *phb = opaque;
+    return phb->lsi_table[irq_num].dt_irq;
+}
+
 static uint64_t spapr_io_read(void *opaque, target_phys_addr_t addr,
                               unsigned size)
 {
@@ -515,6 +523,79 @@  static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
     return phb->dma;
 }
 
+static int spapr_pci_scan_vfio(sPAPRPHBState *phb)
+{
+    char iommupath[256];
+    DIR *dirp;
+    struct dirent *entry;
+
+    if (!phb->scan) {
+        trace_spapr_pci("Autoscan disabled for ", phb->dtbusname);
+        return 0;
+    }
+
+    snprintf(iommupath, sizeof(iommupath),
+             "/sys/kernel/iommu_groups/%d/devices/", phb->iommugroupid);
+    dirp = opendir(iommupath);
+
+    while ((entry = readdir(dirp)) != NULL) {
+        char *tmp;
+        FILE *deviceclassfile;
+        unsigned deviceclass = 0, domainid, busid, devid, fnid;
+        char addr[32];
+        DeviceState *dev;
+
+        if (sscanf(entry->d_name, "%X:%X:%X.%x",
+                   &domainid, &busid, &devid, &fnid) != 4) {
+            continue;
+        }
+
+        tmp = g_strdup_printf("%s%s/class", iommupath, entry->d_name);
+        trace_spapr_pci("Reading device class from ", tmp);
+
+        deviceclassfile = fopen(tmp, "r");
+        if (deviceclassfile) {
+            fscanf(deviceclassfile, "%x", &deviceclass);
+            fclose(deviceclassfile);
+        }
+        g_free(tmp);
+
+        if (!deviceclass) {
+            continue;
+        }
+        if ((phb->scan < 2) &&
+            ((deviceclass >> 16) == (PCI_CLASS_BRIDGE_OTHER >> 8))) {
+            /* Skip _any_ bridge */
+            continue;
+        }
+        if ((deviceclass == 0xc0310) || (deviceclass == 0xc0320)) {
+            /* Tweak USB */
+            phb->force_addr = 1;
+            phb->enable_multifunction = 1;
+        }
+
+        trace_spapr_pci("Creating devicei from ", entry->d_name);
+
+        dev = qdev_create(&phb->host_state.bus->qbus, "vfio-pci");
+        if (!dev) {
+            fprintf(stderr, "failed to create vfio-pci\n");
+            continue;
+        }
+        qdev_prop_parse(dev, "host", entry->d_name);
+        if (phb->force_addr) {
+            snprintf(addr, sizeof(addr), "%x.%x", devid, fnid);
+            qdev_prop_parse(dev, "addr", addr);
+        }
+        if (phb->enable_multifunction) {
+            qdev_prop_set_bit(dev, "multifunction", 1);
+        }
+        qdev_init_nofail(dev);
+    }
+    closedir(dirp);
+
+    return 0;
+}
+
 static int spapr_phb_init(SysBusDevice *s)
 {
     sPAPRPHBState *phb = DO_UPCAST(sPAPRPHBState, host_state.busdev, s);
@@ -567,15 +648,13 @@  static int spapr_phb_init(SysBusDevice *s)
 
     bus = pci_register_bus(&phb->host_state.busdev.qdev,
                            phb->busname ? phb->busname : phb->dtbusname,
-                           pci_spapr_set_irq, NULL, pci_spapr_map_irq, phb,
+                           pci_spapr_set_irq, pci_spapr_get_irq,
+                           pci_spapr_map_irq, phb,
                            &phb->memspace, &phb->iospace,
                            PCI_DEVFN(0, 0), PCI_NUM_PINS);
     phb->host_state.bus = bus;
 
     phb->dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus) << 16);
-    phb->dma_window_start = 0;
-    phb->dma_window_size = 0x40000000;
-    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn, phb->dma_window_size);
     pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
 
     QLIST_INSERT_HEAD(&spapr->phbs, phb, list);
@@ -588,6 +667,21 @@  static int spapr_phb_init(SysBusDevice *s)
         }
     }
 
+    if (phb->iommugroupid >= 0) {
+        if (spapr_pci_scan_vfio(phb) < 0) {
+            return -1;
+        }
+        spapr_vfio_init_dma(phb->iommugroupid, phb->dma_liobn,
+                            &phb->dma_window_start,
+                            &phb->dma_window_size);
+        return 0;
+    }
+
+    phb->dma_window_start = 0;
+    phb->dma_window_size = 0x40000000;
+    phb->dma = spapr_tce_new_dma_context(phb->dma_liobn,
+                                         phb->dma_window_size);
+
     return 0;
 }
 
@@ -599,6 +693,10 @@  static Property spapr_phb_properties[] = {
     DEFINE_PROP_HEX64("io_win_addr", sPAPRPHBState, io_win_addr, 0),
     DEFINE_PROP_HEX64("io_win_size", sPAPRPHBState, io_win_size, 0x10000),
     DEFINE_PROP_HEX64("msi_win_addr", sPAPRPHBState, msi_win_addr, 0),
+    DEFINE_PROP_INT32("iommu", sPAPRPHBState, iommugroupid, -1),
+    DEFINE_PROP_UINT8("scan", sPAPRPHBState, scan, 1),
+    DEFINE_PROP_UINT8("mf", sPAPRPHBState, enable_multifunction, 0),
+    DEFINE_PROP_UINT8("forceaddr", sPAPRPHBState, force_addr, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
index 145071c..f514823 100644
--- a/hw/spapr_pci.h
+++ b/hw/spapr_pci.h
@@ -57,6 +57,10 @@  typedef struct sPAPRPHBState {
         int nvec;
     } msi_table[SPAPR_MSIX_MAX_DEVS];
 
+    int32_t iommugroupid;
+    uint8_t scan; /* 0 don't scan 1 scan only devices 2 scan everything */
+    uint8_t enable_multifunction, force_addr;
+
     QLIST_ENTRY(sPAPRPHBState) list;
 } sPAPRPHBState;
 
diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index 1ac287f..fc84fb4 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -1581,6 +1581,24 @@  static int vfio_connect_container(VFIOGroup *group)
 
         memory_listener_register(&container->listener, get_system_memory());
 
+    } else if (ioctl(fd, VFIO_CHECK_EXTENSION, SPAPR_TCE_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %s\n",
+                         strerror(errno));
+            g_free(container);
+            close(fd);
+            return -1;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, SPAPR_TCE_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %s\n",
+                         strerror(errno));
+            g_free(container);
+            close(fd);
+            return -1;
+        }
     } else {
         error_report("vfio: No available IOMMU models\n");
         g_free(container);
@@ -2005,3 +2023,15 @@  static void register_vfio_pci_dev_type(void)
 }
 
 type_init(register_vfio_pci_dev_type)
+
+int vfio_group_iommu_ioctl(int iommu_group, int request, void *data)
+{
+    VFIOGroup *group;
+
+    group = vfio_get_group(iommu_group);
+    if (!group->container) {
+        return -EINVAL;
+    }
+
+    return ioctl(group->container->fd, request, data);
+}
diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
index 226607c..f44ff07 100644
--- a/hw/vfio_pci.h
+++ b/hw/vfio_pci.h
@@ -105,4 +105,6 @@  typedef struct VFIOGroup {
 #define VFIO_FLAG_IOMMU_SHARED_BIT 0
 #define VFIO_FLAG_IOMMU_SHARED (1U << VFIO_FLAG_UIOMMU_SHARED_BIT)
 
+int vfio_group_iommu_ioctl(int iommu_group, int request, void *data);
+
 #endif /* __VFIO_H__ */
diff --git a/trace-events b/trace-events
index e548f86..9100591 100644
--- a/trace-events
+++ b/trace-events
@@ -848,6 +848,7 @@  qxl_render_guest_primary_resized(int32_t width, int32_t height, int32_t stride,
 qxl_render_update_area_done(void *cookie) "%p"
 
 # hw/spapr_pci.c
+spapr_pci(const char *msg1, const char *msg2) "%s%s"
 spapr_pci_msi(const char *msg, uint32_t n, uint32_t ca) "%s (device#%d, cfg=%x)"
 spapr_pci_msi_setup(const char *name, unsigned vector, uint64_t addr) "dev\"%s\" vector %u, addr=%"PRIx64
 spapr_pci_rtas_ibm_change_msi(unsigned func, unsigned req) "func %u, requested %u"