Patchwork [RFC,v1,4/7] pci: switch iommu to using the memory API

login
register
mail settings
Submitter Avi Kivity
Date Oct. 11, 2012, 1:27 p.m.
Message ID <1349962023-560-5-git-send-email-avi@redhat.com>
Download mbox | patch
Permalink /patch/190908/
State New
Headers show

Comments

Avi Kivity - Oct. 11, 2012, 1:27 p.m.
Instead of requesting a DMAContext from the bus implementation, use a
MemoryRegion.  This can be initialized using memory_region_init_iommu()
(or memory_region_init_alias() for simple, static translations).

Add a destructor, since setups that have per-device translations will
need to return a different iommu region for every device.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 hw/pci.c           | 59 +++++++++++++++++++++++++++++++++---------------------
 hw/pci.h           |  7 +++++--
 hw/pci_internals.h |  5 +++--
 hw/spapr.h         |  2 ++
 hw/spapr_iommu.c   | 35 ++++++++++++++------------------
 hw/spapr_pci.c     | 26 ++++++++++++++++++++----
 hw/spapr_pci.h     |  1 +
 7 files changed, 84 insertions(+), 51 deletions(-)
Paolo Bonzini - Oct. 11, 2012, 1:53 p.m.
Il 11/10/2012 15:27, Avi Kivity ha scritto:
> -static int spapr_tce_translate(DMAContext *dma,
> -                               dma_addr_t addr,
> -                               target_phys_addr_t *paddr,
> -                               target_phys_addr_t *len,
> -                               DMADirection dir)
> +IOMMUTLBEntry spapr_tce_translate(DMAContext *dma, target_phys_addr_t addr,
> +                                  bool is_write)
>  {
>      sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
> -    enum sPAPRTCEAccess access = (dir == DMA_DIRECTION_FROM_DEVICE)
> -        ? SPAPR_TCE_WO : SPAPR_TCE_RO;
> +    enum sPAPRTCEAccess access = is_write ? SPAPR_TCE_WO : SPAPR_TCE_RO;
>      uint64_t tce;
>  
>  #ifdef DEBUG_TCE
> @@ -84,29 +80,27 @@ static int spapr_tce_translate(DMAContext *dma,
>  #ifdef DEBUG_TCE
>          fprintf(stderr, "spapr_tce_translate out of bounds\n");
>  #endif
> -        return -EFAULT;
> +        return (IOMMUTLBEntry) { .valid = false };
>      }
>  
>      tce = tcet->table[addr >> SPAPR_TCE_PAGE_SHIFT].tce;
>  
>      /* Check TCE */
>      if (!(tce & access)) {
> -        return -EPERM;
> +        return (IOMMUTLBEntry) { .valid = false };
>      }
>  
> -    /* How much til end of page ? */
> -    *len = ((~addr) & SPAPR_TCE_PAGE_MASK) + 1;
> -
> -    /* Translate */
> -    *paddr = (tce & ~SPAPR_TCE_PAGE_MASK) |
> -        (addr & SPAPR_TCE_PAGE_MASK);
> -
>  #ifdef DEBUG_TCE
>      fprintf(stderr, " ->  *paddr=0x" TARGET_FMT_plx ", *len=0x"
> -            TARGET_FMT_plx "\n", *paddr, *len);
> +            TARGET_FMT_plx "\n", (tce & ~SPAPR_TCE_PAGE_MASK), SPAPR_TCE_PAGE_MASK + 1);
>  #endif
>  
> -    return 0;
> +    return (IOMMUTLBEntry) {
> +        .device_addr = addr & SPAPR_TCE_PAGE_MASK,
> +        .translated_addr = (tce & ~SPAPR_TCE_PAGE_MASK),
> +        .addr_mask = SPAPR_TCE_PAGE_MASK,
> +        .valid = true,
> +    };
>  }
>  
>  DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
> @@ -118,7 +112,7 @@ DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
>      }
>  
>      tcet = g_malloc0(sizeof(*tcet));
> -    dma_context_init(&tcet->dma, &address_space_memory, spapr_tce_translate, NULL, NULL);
> +    dma_context_init(&tcet->dma, &address_space_memory, NULL, NULL, NULL);
>  
>      tcet->liobn = liobn;
>      tcet->window_size = window_size;
> @@ -253,7 +247,8 @@ int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>          return 0;
>      }
>  
> -    if (iommu->translate == spapr_tce_translate) {
> +    /* FIXME: WHAT?? */
> +    if (true) {
>          sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, iommu);
>          return spapr_dma_dt(fdt, node_off, propname,
>                  tcet->liobn, 0, tcet->window_size);
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 661c05b..24f5b46 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -506,14 +506,30 @@ static void spapr_msi_write(void *opaque, target_phys_addr_t addr,
>  /*
>   * PHB PCI device
>   */
> -static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
> -                                            int devfn)
> +static MemoryRegion *spapr_pci_dma_iommu_new(PCIBus *bus, void *opaque, int devfn)
>  {
>      sPAPRPHBState *phb = opaque;
>  
> -    return phb->dma;
> +    return &phb->iommu;
>  }
>  
> +static void spapr_pci_dma_iommu_destroy(MemoryRegion *iommu)
> +{
> +    /* iommu is shared among devices, do nothing */
> +}
> +
> +static IOMMUTLBEntry spapr_phb_translate(MemoryRegion *iommu, target_phys_addr_t addr,
> +                                         bool is_write)
> +{
> +    sPAPRPHBState *phb = container_of(iommu, sPAPRPHBState, iommu);
> +
> +    return spapr_tce_translate(phb->dma, addr, is_write);
> +}

IIUC, sPAPRTCETable could drop the DMAContext field and just include the
actual translation info.  spapr_tce_new_dma_context becomes
spapr_tce_new and returns an opaque sPAPRTCETable struct that is passed
back to spapr_tce_translate.

And DMAContext can disappear and will be replaced with just an
AddressSpace *.

I like it!

Paolo
Avi Kivity - Oct. 11, 2012, 1:56 p.m.
On 10/11/2012 03:53 PM, Paolo Bonzini wrote:
> Il 11/10/2012 15:27, Avi Kivity ha scritto:
>> -static int spapr_tce_translate(DMAContext *dma,
>> -                               dma_addr_t addr,
>> -                               target_phys_addr_t *paddr,
>> -                               target_phys_addr_t *len,
>> -                               DMADirection dir)
>> +IOMMUTLBEntry spapr_tce_translate(DMAContext *dma, target_phys_addr_t addr,
>> +                                  bool is_write)
>>  {
>>      sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
>> -    enum sPAPRTCEAccess access = (dir == DMA_DIRECTION_FROM_DEVICE)
>> -        ? SPAPR_TCE_WO : SPAPR_TCE_RO;
>> +    enum sPAPRTCEAccess access = is_write ? SPAPR_TCE_WO : SPAPR_TCE_RO;
>>      uint64_t tce;
>>  
>>  #ifdef DEBUG_TCE
>> @@ -84,29 +80,27 @@ static int spapr_tce_translate(DMAContext *dma,
>>  #ifdef DEBUG_TCE
>>          fprintf(stderr, "spapr_tce_translate out of bounds\n");
>>  #endif
>> -        return -EFAULT;
>> +        return (IOMMUTLBEntry) { .valid = false };
>>      }
>>  
>>      tce = tcet->table[addr >> SPAPR_TCE_PAGE_SHIFT].tce;
>>  
>>      /* Check TCE */
>>      if (!(tce & access)) {
>> -        return -EPERM;
>> +        return (IOMMUTLBEntry) { .valid = false };
>>      }
>>  
>> -    /* How much til end of page ? */
>> -    *len = ((~addr) & SPAPR_TCE_PAGE_MASK) + 1;
>> -
>> -    /* Translate */
>> -    *paddr = (tce & ~SPAPR_TCE_PAGE_MASK) |
>> -        (addr & SPAPR_TCE_PAGE_MASK);
>> -
>>  #ifdef DEBUG_TCE
>>      fprintf(stderr, " ->  *paddr=0x" TARGET_FMT_plx ", *len=0x"
>> -            TARGET_FMT_plx "\n", *paddr, *len);
>> +            TARGET_FMT_plx "\n", (tce & ~SPAPR_TCE_PAGE_MASK), SPAPR_TCE_PAGE_MASK + 1);
>>  #endif
>>  
>> -    return 0;
>> +    return (IOMMUTLBEntry) {
>> +        .device_addr = addr & SPAPR_TCE_PAGE_MASK,
>> +        .translated_addr = (tce & ~SPAPR_TCE_PAGE_MASK),
>> +        .addr_mask = SPAPR_TCE_PAGE_MASK,
>> +        .valid = true,
>> +    };
>>  }
>>  
>>  DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
>> @@ -118,7 +112,7 @@ DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
>>      }
>>  
>>      tcet = g_malloc0(sizeof(*tcet));
>> -    dma_context_init(&tcet->dma, &address_space_memory, spapr_tce_translate, NULL, NULL);
>> +    dma_context_init(&tcet->dma, &address_space_memory, NULL, NULL, NULL);
>>  
>>      tcet->liobn = liobn;
>>      tcet->window_size = window_size;
>> @@ -253,7 +247,8 @@ int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>>          return 0;
>>      }
>>  
>> -    if (iommu->translate == spapr_tce_translate) {
>> +    /* FIXME: WHAT?? */
>> +    if (true) {
>>          sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, iommu);
>>          return spapr_dma_dt(fdt, node_off, propname,
>>                  tcet->liobn, 0, tcet->window_size);
>> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
>> index 661c05b..24f5b46 100644
>> --- a/hw/spapr_pci.c
>> +++ b/hw/spapr_pci.c
>> @@ -506,14 +506,30 @@ static void spapr_msi_write(void *opaque, target_phys_addr_t addr,
>>  /*
>>   * PHB PCI device
>>   */
>> -static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
>> -                                            int devfn)
>> +static MemoryRegion *spapr_pci_dma_iommu_new(PCIBus *bus, void *opaque, int devfn)
>>  {
>>      sPAPRPHBState *phb = opaque;
>>  
>> -    return phb->dma;
>> +    return &phb->iommu;
>>  }
>>  
>> +static void spapr_pci_dma_iommu_destroy(MemoryRegion *iommu)
>> +{
>> +    /* iommu is shared among devices, do nothing */
>> +}
>> +
>> +static IOMMUTLBEntry spapr_phb_translate(MemoryRegion *iommu, target_phys_addr_t addr,
>> +                                         bool is_write)
>> +{
>> +    sPAPRPHBState *phb = container_of(iommu, sPAPRPHBState, iommu);
>> +
>> +    return spapr_tce_translate(phb->dma, addr, is_write);
>> +}
> 
> IIUC, sPAPRTCETable could drop the DMAContext field and just include the
> actual translation info.  spapr_tce_new_dma_context becomes
> spapr_tce_new and returns an opaque sPAPRTCETable struct that is passed
> back to spapr_tce_translate.

Maybe, I just typed in code until it compiled, then forgot to copy Alex.
 I have no idea how to test it, so I kept the changes minimal.

> 
> And DMAContext can disappear and will be replaced with just an
> AddressSpace *.

Indeed that's the plan.  address_space_map() and dma_memory_map() simply
duplicate each other.  The only thing remaining is dma_memory_set().

> I like it!

Me too, except when I'm coding it.
Blue Swirl - Oct. 13, 2012, 9:13 a.m.
On Thu, Oct 11, 2012 at 1:27 PM, Avi Kivity <avi@redhat.com> wrote:
> Instead of requesting a DMAContext from the bus implementation, use a
> MemoryRegion.  This can be initialized using memory_region_init_iommu()
> (or memory_region_init_alias() for simple, static translations).
>
> Add a destructor, since setups that have per-device translations will
> need to return a different iommu region for every device.
>
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  hw/pci.c           | 59 +++++++++++++++++++++++++++++++++---------------------
>  hw/pci.h           |  7 +++++--
>  hw/pci_internals.h |  5 +++--
>  hw/spapr.h         |  2 ++
>  hw/spapr_iommu.c   | 35 ++++++++++++++------------------
>  hw/spapr_pci.c     | 26 ++++++++++++++++++++----
>  hw/spapr_pci.h     |  1 +
>  7 files changed, 84 insertions(+), 51 deletions(-)
>
> diff --git a/hw/pci.c b/hw/pci.c
> index 7adf61b..02e1989 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -274,6 +274,21 @@ int pci_find_domain(const PCIBus *bus)
>      return -1;
>  }
>
> +static MemoryRegion *pci_default_iommu(PCIBus *bus, void *opaque, int devfn)
> +{
> +    MemoryRegion *mr = g_new(MemoryRegion, 1);
> +
> +    /* FIXME: inherit memory region from bus creator */
> +    memory_region_init_alias(mr, "iommu-nop", get_system_memory(), 0, INT64_MAX);
> +    return mr;
> +}
> +
> +static void pci_default_iommu_dtor(MemoryRegion *mr)
> +{
> +    memory_region_destroy(mr);
> +    g_free(mr);
> +}
> +
>  void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
>                           const char *name,
>                           MemoryRegion *address_space_mem,
> @@ -285,6 +300,7 @@ void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
>      bus->devfn_min = devfn_min;
>      bus->address_space_mem = address_space_mem;
>      bus->address_space_io = address_space_io;
> +    pci_setup_iommu(bus, pci_default_iommu, pci_default_iommu_dtor, NULL);
>
>      /* host bridge */
>      QLIST_INIT(&bus->child);
> @@ -775,21 +791,16 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
>                       PCI_SLOT(devfn), PCI_FUNC(devfn), name, bus->devices[devfn]->name);
>          return NULL;
>      }
> +
>      pci_dev->bus = bus;
> -    if (bus->dma_context_fn) {
> -        pci_dev->dma = bus->dma_context_fn(bus, bus->dma_context_opaque, devfn);
> -    } else {
> -        /* FIXME: Make dma_context_fn use MemoryRegions instead, so this path is
> -         * taken unconditionally */
> -        /* FIXME: inherit memory region from bus creator */
> -        memory_region_init_alias(&pci_dev->bus_master_enable_region, "bus master",
> -                                 get_system_memory(), 0,
> -                                 memory_region_size(get_system_memory()));
> -        memory_region_set_enabled(&pci_dev->bus_master_enable_region, false);
> -        address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_enable_region);
> -        pci_dev->dma = g_new(DMAContext, 1);
> -        dma_context_init(pci_dev->dma, &pci_dev->bus_master_as, NULL, NULL, NULL);
> -    }
> +    pci_dev->iommu = bus->iommu_fn(bus, bus->iommu_opaque, devfn);
> +    memory_region_init_alias(&pci_dev->bus_master_enable_region, "bus master",
> +                             pci_dev->iommu, 0, memory_region_size(pci_dev->iommu));
> +    memory_region_set_enabled(&pci_dev->bus_master_enable_region, false);
> +    address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_enable_region);
> +    pci_dev->dma = g_new(DMAContext, 1);
> +    dma_context_init(pci_dev->dma, &pci_dev->bus_master_as, NULL, NULL, NULL);
> +
>      pci_dev->devfn = devfn;
>      pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
>      pci_dev->irq_state = 0;
> @@ -843,12 +854,12 @@ static void do_pci_unregister_device(PCIDevice *pci_dev)
>      pci_dev->bus->devices[pci_dev->devfn] = NULL;
>      pci_config_free(pci_dev);
>
> -    if (!pci_dev->bus->dma_context_fn) {
> -        address_space_destroy(&pci_dev->bus_master_as);
> -        memory_region_destroy(&pci_dev->bus_master_enable_region);
> -        g_free(pci_dev->dma);
> -        pci_dev->dma = NULL;
> -    }
> +    address_space_destroy(&pci_dev->bus_master_as);
> +    memory_region_del_subregion(&pci_dev->bus_master_enable_region, pci_dev->iommu);
> +    pci_dev->bus->iommu_dtor_fn(pci_dev->iommu);
> +    memory_region_destroy(&pci_dev->bus_master_enable_region);
> +    g_free(pci_dev->dma);
> +    pci_dev->dma = NULL;
>  }
>
>  static void pci_unregister_io_regions(PCIDevice *pci_dev)
> @@ -2092,10 +2103,12 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
>      k->props = pci_props;
>  }
>
> -void pci_setup_iommu(PCIBus *bus, PCIDMAContextFunc fn, void *opaque)
> +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, PCIIOMMUDestructorFunc dtor,
> +                     void *opaque)
>  {
> -    bus->dma_context_fn = fn;
> -    bus->dma_context_opaque = opaque;
> +    bus->iommu_fn = fn;
> +    bus->iommu_dtor_fn = dtor;
> +    bus->iommu_opaque = opaque;
>  }
>
>  static TypeInfo pci_device_type_info = {
> diff --git a/hw/pci.h b/hw/pci.h
> index a65e490..370354a 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -213,6 +213,7 @@ struct PCIDevice {
>      PCIIORegion io_regions[PCI_NUM_REGIONS];
>      AddressSpace bus_master_as;
>      MemoryRegion bus_master_enable_region;
> +    MemoryRegion *iommu;
>      DMAContext *dma;
>
>      /* do not access the following fields */
> @@ -351,9 +352,11 @@ int pci_read_devaddr(Monitor *mon, const char *addr, int *domp, int *busp,
>
>  void pci_device_deassert_intx(PCIDevice *dev);
>
> -typedef DMAContext *(*PCIDMAContextFunc)(PCIBus *, void *, int);
> +typedef MemoryRegion *(*PCIIOMMUFunc)(PCIBus *, void *, int);
> +typedef void (*PCIIOMMUDestructorFunc)(MemoryRegion *mr);
>
> -void pci_setup_iommu(PCIBus *bus, PCIDMAContextFunc fn, void *opaque);
> +void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, PCIIOMMUDestructorFunc dtor,
> +                     void *opaque);
>
>  static inline void
>  pci_set_byte(uint8_t *config, uint8_t val)
> diff --git a/hw/pci_internals.h b/hw/pci_internals.h
> index c931b64..040508f 100644
> --- a/hw/pci_internals.h
> +++ b/hw/pci_internals.h
> @@ -17,8 +17,9 @@
>
>  struct PCIBus {
>      BusState qbus;
> -    PCIDMAContextFunc dma_context_fn;
> -    void *dma_context_opaque;
> +    PCIIOMMUFunc iommu_fn;
> +    PCIIOMMUDestructorFunc iommu_dtor_fn;
> +    void *iommu_opaque;

Maybe the opaque could be avoided (in later patches) by clever use of
container_of() by the functions to get the parent structure?

>      uint8_t devfn_min;
>      pci_set_irq_fn set_irq;
>      pci_map_irq_fn map_irq;
> diff --git a/hw/spapr.h b/hw/spapr.h
> index ac34a17..452efba1 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -334,6 +334,8 @@ typedef struct sPAPRTCE {
>  #define SPAPR_PCI_BASE_LIOBN    0x80000000
>
>  void spapr_iommu_init(void);
> +IOMMUTLBEntry spapr_tce_translate(DMAContext *dma, target_phys_addr_t addr,
> +                                  bool is_write);
>  DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size);
>  void spapr_tce_free(DMAContext *dma);
>  int spapr_dma_dt(void *fdt, int node_off, const char *propname,
> diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
> index 54798a3..79e35d1 100644
> --- a/hw/spapr_iommu.c
> +++ b/hw/spapr_iommu.c
> @@ -63,15 +63,11 @@ static sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn)
>      return NULL;
>  }
>
> -static int spapr_tce_translate(DMAContext *dma,
> -                               dma_addr_t addr,
> -                               target_phys_addr_t *paddr,
> -                               target_phys_addr_t *len,
> -                               DMADirection dir)
> +IOMMUTLBEntry spapr_tce_translate(DMAContext *dma, target_phys_addr_t addr,
> +                                  bool is_write)
>  {
>      sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
> -    enum sPAPRTCEAccess access = (dir == DMA_DIRECTION_FROM_DEVICE)
> -        ? SPAPR_TCE_WO : SPAPR_TCE_RO;
> +    enum sPAPRTCEAccess access = is_write ? SPAPR_TCE_WO : SPAPR_TCE_RO;
>      uint64_t tce;
>
>  #ifdef DEBUG_TCE
> @@ -84,29 +80,27 @@ static int spapr_tce_translate(DMAContext *dma,
>  #ifdef DEBUG_TCE
>          fprintf(stderr, "spapr_tce_translate out of bounds\n");
>  #endif
> -        return -EFAULT;
> +        return (IOMMUTLBEntry) { .valid = false };
>      }
>
>      tce = tcet->table[addr >> SPAPR_TCE_PAGE_SHIFT].tce;
>
>      /* Check TCE */
>      if (!(tce & access)) {
> -        return -EPERM;
> +        return (IOMMUTLBEntry) { .valid = false };
>      }
>
> -    /* How much til end of page ? */
> -    *len = ((~addr) & SPAPR_TCE_PAGE_MASK) + 1;
> -
> -    /* Translate */
> -    *paddr = (tce & ~SPAPR_TCE_PAGE_MASK) |
> -        (addr & SPAPR_TCE_PAGE_MASK);
> -
>  #ifdef DEBUG_TCE
>      fprintf(stderr, " ->  *paddr=0x" TARGET_FMT_plx ", *len=0x"
> -            TARGET_FMT_plx "\n", *paddr, *len);
> +            TARGET_FMT_plx "\n", (tce & ~SPAPR_TCE_PAGE_MASK), SPAPR_TCE_PAGE_MASK + 1);
>  #endif
>
> -    return 0;
> +    return (IOMMUTLBEntry) {
> +        .device_addr = addr & SPAPR_TCE_PAGE_MASK,
> +        .translated_addr = (tce & ~SPAPR_TCE_PAGE_MASK),
> +        .addr_mask = SPAPR_TCE_PAGE_MASK,
> +        .valid = true,
> +    };
>  }
>
>  DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
> @@ -118,7 +112,7 @@ DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
>      }
>
>      tcet = g_malloc0(sizeof(*tcet));
> -    dma_context_init(&tcet->dma, &address_space_memory, spapr_tce_translate, NULL, NULL);
> +    dma_context_init(&tcet->dma, &address_space_memory, NULL, NULL, NULL);
>
>      tcet->liobn = liobn;
>      tcet->window_size = window_size;
> @@ -253,7 +247,8 @@ int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
>          return 0;
>      }
>
> -    if (iommu->translate == spapr_tce_translate) {
> +    /* FIXME: WHAT?? */
> +    if (true) {
>          sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, iommu);
>          return spapr_dma_dt(fdt, node_off, propname,
>                  tcet->liobn, 0, tcet->window_size);
> diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
> index 661c05b..24f5b46 100644
> --- a/hw/spapr_pci.c
> +++ b/hw/spapr_pci.c
> @@ -506,14 +506,30 @@ static void spapr_msi_write(void *opaque, target_phys_addr_t addr,
>  /*
>   * PHB PCI device
>   */
> -static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
> -                                            int devfn)
> +static MemoryRegion *spapr_pci_dma_iommu_new(PCIBus *bus, void *opaque, int devfn)
>  {
>      sPAPRPHBState *phb = opaque;
>
> -    return phb->dma;
> +    return &phb->iommu;
>  }
>
> +static void spapr_pci_dma_iommu_destroy(MemoryRegion *iommu)
> +{
> +    /* iommu is shared among devices, do nothing */
> +}
> +
> +static IOMMUTLBEntry spapr_phb_translate(MemoryRegion *iommu, target_phys_addr_t addr,
> +                                         bool is_write)
> +{
> +    sPAPRPHBState *phb = container_of(iommu, sPAPRPHBState, iommu);
> +
> +    return spapr_tce_translate(phb->dma, addr, is_write);
> +}
> +
> +static MemoryRegionIOMMUOps spapr_iommu_ops = {
> +    .translate = spapr_phb_translate,
> +};
> +
>  static int spapr_phb_init(SysBusDevice *s)
>  {
>      sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
> @@ -576,7 +592,9 @@ static int spapr_phb_init(SysBusDevice *s)
>      sphb->dma_window_start = 0;
>      sphb->dma_window_size = 0x40000000;
>      sphb->dma = spapr_tce_new_dma_context(sphb->dma_liobn, sphb->dma_window_size);
> -    pci_setup_iommu(bus, spapr_pci_dma_context_fn, sphb);
> +    memory_region_init_iommu(&sphb->iommu, &spapr_iommu_ops, get_system_memory(),
> +                             "iommu-spapr", INT64_MAX);
> +    pci_setup_iommu(bus, spapr_pci_dma_iommu_new, spapr_pci_dma_iommu_destroy, sphb);
>
>      QLIST_INSERT_HEAD(&spapr->phbs, sphb, list);
>
> diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
> index 670dc62..171db45 100644
> --- a/hw/spapr_pci.h
> +++ b/hw/spapr_pci.h
> @@ -45,6 +45,7 @@ typedef struct sPAPRPHBState {
>      target_phys_addr_t mem_win_addr, mem_win_size, io_win_addr, io_win_size;
>      target_phys_addr_t msi_win_addr;
>      MemoryRegion memwindow, iowindow, msiwindow;
> +    MemoryRegion iommu;
>
>      uint32_t dma_liobn;
>      uint64_t dma_window_start;
> --
> 1.7.12
>
Avi Kivity - Oct. 15, 2012, 10:31 a.m.
On 10/13/2012 11:13 AM, Blue Swirl wrote:
>>  struct PCIBus {
>>      BusState qbus;
>> -    PCIDMAContextFunc dma_context_fn;
>> -    void *dma_context_opaque;
>> +    PCIIOMMUFunc iommu_fn;
>> +    PCIIOMMUDestructorFunc iommu_dtor_fn;
>> +    void *iommu_opaque;
> 
> Maybe the opaque could be avoided (in later patches) by clever use of
> container_of() by the functions to get the parent structure?

Indeed yes.  This requires replacing pci_bus_new() and
pci_register_bus() by a conventional pci_bus_init().  I think it's the
right direction to move and the memory API always uses caller-managed
allocation and objects, instead of API-managed allocation and
pointers/handles.

Patch

diff --git a/hw/pci.c b/hw/pci.c
index 7adf61b..02e1989 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -274,6 +274,21 @@  int pci_find_domain(const PCIBus *bus)
     return -1;
 }
 
+static MemoryRegion *pci_default_iommu(PCIBus *bus, void *opaque, int devfn)
+{
+    MemoryRegion *mr = g_new(MemoryRegion, 1);
+
+    /* FIXME: inherit memory region from bus creator */
+    memory_region_init_alias(mr, "iommu-nop", get_system_memory(), 0, INT64_MAX);
+    return mr;
+}
+
+static void pci_default_iommu_dtor(MemoryRegion *mr)
+{
+    memory_region_destroy(mr);
+    g_free(mr);
+}
+
 void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
                          const char *name,
                          MemoryRegion *address_space_mem,
@@ -285,6 +300,7 @@  void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
     bus->devfn_min = devfn_min;
     bus->address_space_mem = address_space_mem;
     bus->address_space_io = address_space_io;
+    pci_setup_iommu(bus, pci_default_iommu, pci_default_iommu_dtor, NULL);
 
     /* host bridge */
     QLIST_INIT(&bus->child);
@@ -775,21 +791,16 @@  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
                      PCI_SLOT(devfn), PCI_FUNC(devfn), name, bus->devices[devfn]->name);
         return NULL;
     }
+
     pci_dev->bus = bus;
-    if (bus->dma_context_fn) {
-        pci_dev->dma = bus->dma_context_fn(bus, bus->dma_context_opaque, devfn);
-    } else {
-        /* FIXME: Make dma_context_fn use MemoryRegions instead, so this path is
-         * taken unconditionally */
-        /* FIXME: inherit memory region from bus creator */
-        memory_region_init_alias(&pci_dev->bus_master_enable_region, "bus master",
-                                 get_system_memory(), 0,
-                                 memory_region_size(get_system_memory()));
-        memory_region_set_enabled(&pci_dev->bus_master_enable_region, false);
-        address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_enable_region);
-        pci_dev->dma = g_new(DMAContext, 1);
-        dma_context_init(pci_dev->dma, &pci_dev->bus_master_as, NULL, NULL, NULL);
-    }
+    pci_dev->iommu = bus->iommu_fn(bus, bus->iommu_opaque, devfn);
+    memory_region_init_alias(&pci_dev->bus_master_enable_region, "bus master",
+                             pci_dev->iommu, 0, memory_region_size(pci_dev->iommu));
+    memory_region_set_enabled(&pci_dev->bus_master_enable_region, false);
+    address_space_init(&pci_dev->bus_master_as, &pci_dev->bus_master_enable_region);
+    pci_dev->dma = g_new(DMAContext, 1);
+    dma_context_init(pci_dev->dma, &pci_dev->bus_master_as, NULL, NULL, NULL);
+
     pci_dev->devfn = devfn;
     pstrcpy(pci_dev->name, sizeof(pci_dev->name), name);
     pci_dev->irq_state = 0;
@@ -843,12 +854,12 @@  static void do_pci_unregister_device(PCIDevice *pci_dev)
     pci_dev->bus->devices[pci_dev->devfn] = NULL;
     pci_config_free(pci_dev);
 
-    if (!pci_dev->bus->dma_context_fn) {
-        address_space_destroy(&pci_dev->bus_master_as);
-        memory_region_destroy(&pci_dev->bus_master_enable_region);
-        g_free(pci_dev->dma);
-        pci_dev->dma = NULL;
-    }
+    address_space_destroy(&pci_dev->bus_master_as);
+    memory_region_del_subregion(&pci_dev->bus_master_enable_region, pci_dev->iommu);
+    pci_dev->bus->iommu_dtor_fn(pci_dev->iommu);
+    memory_region_destroy(&pci_dev->bus_master_enable_region);
+    g_free(pci_dev->dma);
+    pci_dev->dma = NULL;
 }
 
 static void pci_unregister_io_regions(PCIDevice *pci_dev)
@@ -2092,10 +2103,12 @@  static void pci_device_class_init(ObjectClass *klass, void *data)
     k->props = pci_props;
 }
 
-void pci_setup_iommu(PCIBus *bus, PCIDMAContextFunc fn, void *opaque)
+void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, PCIIOMMUDestructorFunc dtor,
+                     void *opaque)
 {
-    bus->dma_context_fn = fn;
-    bus->dma_context_opaque = opaque;
+    bus->iommu_fn = fn;
+    bus->iommu_dtor_fn = dtor;
+    bus->iommu_opaque = opaque;
 }
 
 static TypeInfo pci_device_type_info = {
diff --git a/hw/pci.h b/hw/pci.h
index a65e490..370354a 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -213,6 +213,7 @@  struct PCIDevice {
     PCIIORegion io_regions[PCI_NUM_REGIONS];
     AddressSpace bus_master_as;
     MemoryRegion bus_master_enable_region;
+    MemoryRegion *iommu;
     DMAContext *dma;
 
     /* do not access the following fields */
@@ -351,9 +352,11 @@  int pci_read_devaddr(Monitor *mon, const char *addr, int *domp, int *busp,
 
 void pci_device_deassert_intx(PCIDevice *dev);
 
-typedef DMAContext *(*PCIDMAContextFunc)(PCIBus *, void *, int);
+typedef MemoryRegion *(*PCIIOMMUFunc)(PCIBus *, void *, int);
+typedef void (*PCIIOMMUDestructorFunc)(MemoryRegion *mr);
 
-void pci_setup_iommu(PCIBus *bus, PCIDMAContextFunc fn, void *opaque);
+void pci_setup_iommu(PCIBus *bus, PCIIOMMUFunc fn, PCIIOMMUDestructorFunc dtor,
+                     void *opaque);
 
 static inline void
 pci_set_byte(uint8_t *config, uint8_t val)
diff --git a/hw/pci_internals.h b/hw/pci_internals.h
index c931b64..040508f 100644
--- a/hw/pci_internals.h
+++ b/hw/pci_internals.h
@@ -17,8 +17,9 @@ 
 
 struct PCIBus {
     BusState qbus;
-    PCIDMAContextFunc dma_context_fn;
-    void *dma_context_opaque;
+    PCIIOMMUFunc iommu_fn;
+    PCIIOMMUDestructorFunc iommu_dtor_fn;
+    void *iommu_opaque;
     uint8_t devfn_min;
     pci_set_irq_fn set_irq;
     pci_map_irq_fn map_irq;
diff --git a/hw/spapr.h b/hw/spapr.h
index ac34a17..452efba1 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -334,6 +334,8 @@  typedef struct sPAPRTCE {
 #define SPAPR_PCI_BASE_LIOBN    0x80000000
 
 void spapr_iommu_init(void);
+IOMMUTLBEntry spapr_tce_translate(DMAContext *dma, target_phys_addr_t addr,
+                                  bool is_write);
 DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size);
 void spapr_tce_free(DMAContext *dma);
 int spapr_dma_dt(void *fdt, int node_off, const char *propname,
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 54798a3..79e35d1 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -63,15 +63,11 @@  static sPAPRTCETable *spapr_tce_find_by_liobn(uint32_t liobn)
     return NULL;
 }
 
-static int spapr_tce_translate(DMAContext *dma,
-                               dma_addr_t addr,
-                               target_phys_addr_t *paddr,
-                               target_phys_addr_t *len,
-                               DMADirection dir)
+IOMMUTLBEntry spapr_tce_translate(DMAContext *dma, target_phys_addr_t addr,
+                                  bool is_write)
 {
     sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, dma);
-    enum sPAPRTCEAccess access = (dir == DMA_DIRECTION_FROM_DEVICE)
-        ? SPAPR_TCE_WO : SPAPR_TCE_RO;
+    enum sPAPRTCEAccess access = is_write ? SPAPR_TCE_WO : SPAPR_TCE_RO;
     uint64_t tce;
 
 #ifdef DEBUG_TCE
@@ -84,29 +80,27 @@  static int spapr_tce_translate(DMAContext *dma,
 #ifdef DEBUG_TCE
         fprintf(stderr, "spapr_tce_translate out of bounds\n");
 #endif
-        return -EFAULT;
+        return (IOMMUTLBEntry) { .valid = false };
     }
 
     tce = tcet->table[addr >> SPAPR_TCE_PAGE_SHIFT].tce;
 
     /* Check TCE */
     if (!(tce & access)) {
-        return -EPERM;
+        return (IOMMUTLBEntry) { .valid = false };
     }
 
-    /* How much til end of page ? */
-    *len = ((~addr) & SPAPR_TCE_PAGE_MASK) + 1;
-
-    /* Translate */
-    *paddr = (tce & ~SPAPR_TCE_PAGE_MASK) |
-        (addr & SPAPR_TCE_PAGE_MASK);
-
 #ifdef DEBUG_TCE
     fprintf(stderr, " ->  *paddr=0x" TARGET_FMT_plx ", *len=0x"
-            TARGET_FMT_plx "\n", *paddr, *len);
+            TARGET_FMT_plx "\n", (tce & ~SPAPR_TCE_PAGE_MASK), SPAPR_TCE_PAGE_MASK + 1);
 #endif
 
-    return 0;
+    return (IOMMUTLBEntry) {
+        .device_addr = addr & SPAPR_TCE_PAGE_MASK,
+        .translated_addr = (tce & ~SPAPR_TCE_PAGE_MASK),
+        .addr_mask = SPAPR_TCE_PAGE_MASK,
+        .valid = true,
+    };
 }
 
 DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
@@ -118,7 +112,7 @@  DMAContext *spapr_tce_new_dma_context(uint32_t liobn, size_t window_size)
     }
 
     tcet = g_malloc0(sizeof(*tcet));
-    dma_context_init(&tcet->dma, &address_space_memory, spapr_tce_translate, NULL, NULL);
+    dma_context_init(&tcet->dma, &address_space_memory, NULL, NULL, NULL);
 
     tcet->liobn = liobn;
     tcet->window_size = window_size;
@@ -253,7 +247,8 @@  int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
         return 0;
     }
 
-    if (iommu->translate == spapr_tce_translate) {
+    /* FIXME: WHAT?? */
+    if (true) {
         sPAPRTCETable *tcet = DO_UPCAST(sPAPRTCETable, dma, iommu);
         return spapr_dma_dt(fdt, node_off, propname,
                 tcet->liobn, 0, tcet->window_size);
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 661c05b..24f5b46 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -506,14 +506,30 @@  static void spapr_msi_write(void *opaque, target_phys_addr_t addr,
 /*
  * PHB PCI device
  */
-static DMAContext *spapr_pci_dma_context_fn(PCIBus *bus, void *opaque,
-                                            int devfn)
+static MemoryRegion *spapr_pci_dma_iommu_new(PCIBus *bus, void *opaque, int devfn)
 {
     sPAPRPHBState *phb = opaque;
 
-    return phb->dma;
+    return &phb->iommu;
 }
 
+static void spapr_pci_dma_iommu_destroy(MemoryRegion *iommu)
+{
+    /* iommu is shared among devices, do nothing */
+}
+
+static IOMMUTLBEntry spapr_phb_translate(MemoryRegion *iommu, target_phys_addr_t addr,
+                                         bool is_write)
+{
+    sPAPRPHBState *phb = container_of(iommu, sPAPRPHBState, iommu);
+
+    return spapr_tce_translate(phb->dma, addr, is_write);
+}
+
+static MemoryRegionIOMMUOps spapr_iommu_ops = {
+    .translate = spapr_phb_translate,
+};
+
 static int spapr_phb_init(SysBusDevice *s)
 {
     sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
@@ -576,7 +592,9 @@  static int spapr_phb_init(SysBusDevice *s)
     sphb->dma_window_start = 0;
     sphb->dma_window_size = 0x40000000;
     sphb->dma = spapr_tce_new_dma_context(sphb->dma_liobn, sphb->dma_window_size);
-    pci_setup_iommu(bus, spapr_pci_dma_context_fn, sphb);
+    memory_region_init_iommu(&sphb->iommu, &spapr_iommu_ops, get_system_memory(),
+                             "iommu-spapr", INT64_MAX);
+    pci_setup_iommu(bus, spapr_pci_dma_iommu_new, spapr_pci_dma_iommu_destroy, sphb);
 
     QLIST_INSERT_HEAD(&spapr->phbs, sphb, list);
 
diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
index 670dc62..171db45 100644
--- a/hw/spapr_pci.h
+++ b/hw/spapr_pci.h
@@ -45,6 +45,7 @@  typedef struct sPAPRPHBState {
     target_phys_addr_t mem_win_addr, mem_win_size, io_win_addr, io_win_size;
     target_phys_addr_t msi_win_addr;
     MemoryRegion memwindow, iowindow, msiwindow;
+    MemoryRegion iommu;
 
     uint32_t dma_liobn;
     uint64_t dma_window_start;