
[2/7] pci: memory access API and IOMMU support

Message ID 1283007298-10942-3-git-send-email-eduard.munteanu@linux360.ro
State New

Commit Message

Eduard - Gabriel Munteanu Aug. 28, 2010, 2:54 p.m. UTC
PCI devices should access memory through pci_memory_*() instead of
cpu_physical_memory_*(). This also provides support for translation and
access checking in case an IOMMU is emulated.

Memory maps are treated as remote IOTLBs (that is, translation caches
belonging to the IOMMU-aware device itself). Clients (devices) must
provide callbacks for map invalidation in case these maps are
persistent beyond the current I/O context, e.g. AIO DMA transfers.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
---
 hw/pci.c           |  185 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/pci.h           |   74 +++++++++++++++++++++
 hw/pci_internals.h |   12 ++++
 qemu-common.h      |    1 +
 4 files changed, 271 insertions(+), 1 deletions(-)
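
For illustration, a converted device might issue DMA like this (a sketch; the device and descriptor names are hypothetical, only pci_memory_read() comes from this patch):

    /* Hypothetical device code: fetch a DMA descriptor through the
     * IOMMU-aware helper instead of cpu_physical_memory_read(). */
    static void mydev_fetch_descriptor(PCIDevice *dev, pcibus_t desc_addr)
    {
        uint8_t desc[16];

        /* Translated and permission-checked when an IOMMU is emulated. */
        pci_memory_read(dev, desc_addr, desc, sizeof(desc));

        /* ... decode the descriptor and start the transfer ... */
    }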

Comments

Michael S. Tsirkin Sept. 2, 2010, 5:28 a.m. UTC | #1
On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote:
> PCI devices should access memory through pci_memory_*() instead of
> cpu_physical_memory_*(). This also provides support for translation and
> access checking in case an IOMMU is emulated.
> 
> Memory maps are treated as remote IOTLBs (that is, translation caches
> belonging to the IOMMU-aware device itself). Clients (devices) must
> provide callbacks for map invalidation in case these maps are
> persistent beyond the current I/O context, e.g. AIO DMA transfers.
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>


I am concerned about adding more pointer chasing on the data path.
Could we have
1. an iommu pointer in a device, inherited by secondary buses
   when they are created and by devices from buses when they are attached.
2. translation pointer in the iommu instead of the bus
3. pci_memory_XX functions inline, doing fast path for non-iommu case:
   
	if (__builtin_expect(!dev->iommu, 1))
		return cpu_memory_rw
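
Spelled out, that fast path might look like the following sketch (assuming the iommu pointer has been copied into PCIDevice; pci_memory_rw_iommu() is a hypothetical out-of-line slow path):

    /* Sketch only: assumes an 'iommu' field on PCIDevice and a
     * hypothetical out-of-line slow path, pci_memory_rw_iommu(). */
    static inline void pci_memory_rw(PCIDevice *dev, pcibus_t addr,
                                     uint8_t *buf, pcibus_t len, int is_write)
    {
        if (__builtin_expect(!dev->iommu, 1)) {
            /* Fast path: no IOMMU, access guest memory directly. */
            cpu_physical_memory_rw(addr, buf, len, is_write);
            return;
        }
        pci_memory_rw_iommu(dev, addr, buf, len, is_write);
    }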
	


> ---
>  hw/pci.c           |  185 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  hw/pci.h           |   74 +++++++++++++++++++++
>  hw/pci_internals.h |   12 ++++
>  qemu-common.h      |    1 +
>  4 files changed, 271 insertions(+), 1 deletions(-)

Almost nothing here is PCI specific.
Can this code go into dma.c/dma.h?
We would have struct DMADevice, APIs like device_dma_write etc.
This would help us get rid of the void * stuff as well?
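
Such a bus-neutral layer might be shaped roughly like this (a sketch; DMADevice, DMATranslateFunc and dma_addr_t are hypothetical, not existing QEMU types):

    /* Hypothetical dma.h sketch: bus-agnostic DMA with optional IOMMU. */
    typedef uint64_t dma_addr_t;

    typedef struct DMADevice DMADevice;
    typedef int DMATranslateFunc(DMADevice *dev, dma_addr_t addr,
                                 target_phys_addr_t *paddr,
                                 target_phys_addr_t *len,
                                 unsigned perms);

    struct DMADevice {
        DMATranslateFunc *translate;   /* NULL when no IOMMU is present */
    };

    void dma_memory_rw(DMADevice *dev, dma_addr_t addr,
                       uint8_t *buf, dma_addr_t len, int is_write);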

> 
> diff --git a/hw/pci.c b/hw/pci.c
> index 2dc1577..b460905 100644
> --- a/hw/pci.c
> +++ b/hw/pci.c
> @@ -158,6 +158,19 @@ static void pci_device_reset(PCIDevice *dev)
>      pci_update_mappings(dev);
>  }
>  
> +static int pci_no_translate(PCIDevice *iommu,
> +                            PCIDevice *dev,
> +                            pcibus_t addr,
> +                            target_phys_addr_t *paddr,
> +                            target_phys_addr_t *len,
> +                            unsigned perms)
> +{
> +    *paddr = addr;
> +    *len = -1;
> +
> +    return 0;
> +}
> +
>  static void pci_bus_reset(void *opaque)
>  {
>      PCIBus *bus = opaque;
> @@ -220,7 +233,10 @@ void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
>  {
>      qbus_create_inplace(&bus->qbus, &pci_bus_info, parent, name);
>      assert(PCI_FUNC(devfn_min) == 0);
> -    bus->devfn_min = devfn_min;
> +
> +    bus->devfn_min  = devfn_min;
> +    bus->iommu      = NULL;
> +    bus->translate  = pci_no_translate;
>  
>      /* host bridge */
>      QLIST_INIT(&bus->child);
> @@ -1789,3 +1805,170 @@ static char *pcibus_get_dev_path(DeviceState *dev)
>      return strdup(path);
>  }
>  
> +void pci_register_iommu(PCIDevice *iommu,
> +                        PCITranslateFunc *translate)
> +{
> +    iommu->bus->iommu = iommu;
> +    iommu->bus->translate = translate;
> +}
> +

The above seems broken for secondary buses, right?  Also, can we use
qdev for initialization in some way, instead of adding more APIs?  E.g.
I think it would be nice if we could just use qdev command line flags to
control which bus is behind iommu and which isn't.


> +void pci_memory_rw(PCIDevice *dev,
> +                   pcibus_t addr,
> +                   uint8_t *buf,
> +                   pcibus_t len,
> +                   int is_write)
> +{
> +    int err;
> +    unsigned perms;
> +    PCIDevice *iommu = dev->bus->iommu;
> +    target_phys_addr_t paddr, plen;
> +
> +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> +
> +    while (len) {
> +        err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> +        if (err)
> +            return;
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len)
> +            plen = len;
> +
> +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +}
> +
> +static void pci_memory_register_map(PCIDevice *dev,
> +                                    pcibus_t addr,
> +                                    pcibus_t len,
> +                                    target_phys_addr_t paddr,
> +                                    PCIInvalidateMapFunc *invalidate,
> +                                    void *invalidate_opaque)
> +{
> +    PCIMemoryMap *map;
> +
> +    map = qemu_malloc(sizeof(PCIMemoryMap));
> +    map->addr               = addr;
> +    map->len                = len;
> +    map->paddr              = paddr;
> +    map->invalidate         = invalidate;
> +    map->invalidate_opaque  = invalidate_opaque;
> +
> +    QLIST_INSERT_HEAD(&dev->memory_maps, map, list);
> +}
> +
> +static void pci_memory_unregister_map(PCIDevice *dev,
> +                                      target_phys_addr_t paddr,
> +                                      target_phys_addr_t len)
> +{
> +    PCIMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> +        if (map->paddr == paddr && map->len == len) {
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void pci_memory_invalidate_range(PCIDevice *dev,
> +                                 pcibus_t addr,
> +                                 pcibus_t len)
> +{
> +    PCIMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> +            map->invalidate(map->invalidate_opaque);
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void *pci_memory_map(PCIDevice *dev,
> +                     PCIInvalidateMapFunc *cb,
> +                     void *opaque,
> +                     pcibus_t addr,
> +                     target_phys_addr_t *len,
> +                     int is_write)
> +{
> +    int err;
> +    unsigned perms;
> +    PCIDevice *iommu = dev->bus->iommu;
> +    target_phys_addr_t paddr, plen;
> +
> +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> +
> +    plen = *len;
> +    err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> +    if (err)
> +        return NULL;
> +
> +    /*
> +     * If this is true, the virtual region is contiguous,
> +     * but the translated physical region isn't. We just
> +     * clamp *len, much like cpu_physical_memory_map() does.
> +     */
> +    if (plen < *len)
> +        *len = plen;
> +
> +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> +    if (cb)
> +        pci_memory_register_map(dev, addr, *len, paddr, cb, opaque);
> +
> +    return cpu_physical_memory_map(paddr, len, is_write);
> +}
> +

All the above is really only useful for when there is an iommu,
right? So maybe we should shortcut all this if there's no iommu?

> +void pci_memory_unmap(PCIDevice *dev,
> +                      void *buffer,
> +                      target_phys_addr_t len,
> +                      int is_write,
> +                      target_phys_addr_t access_len)
> +{
> +    cpu_physical_memory_unmap(buffer, len, is_write, access_len);
> +    pci_memory_unregister_map(dev, (target_phys_addr_t) buffer, len);
> +}
> +
> +#define DEFINE_PCI_LD(suffix, size)                                       \
> +uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr)              \
> +{                                                                         \
> +    int err;                                                              \
> +    target_phys_addr_t paddr, plen;                                       \
> +                                                                          \
> +    err = dev->bus->translate(dev->bus->iommu, dev,                       \
> +                              addr, &paddr, &plen, IOMMU_PERM_READ);      \
> +    if (err || (plen < size / 8))                                         \
> +        return 0;                                                         \
> +                                                                          \
> +    return ld##suffix##_phys(paddr);                                      \
> +}
> +
> +#define DEFINE_PCI_ST(suffix, size)                                       \
> +void pci_st##suffix(PCIDevice *dev, pcibus_t addr, uint##size##_t val)    \
> +{                                                                         \
> +    int err;                                                              \
> +    target_phys_addr_t paddr, plen;                                       \
> +                                                                          \
> +    err = dev->bus->translate(dev->bus->iommu, dev,                       \
> +                              addr, &paddr, &plen, IOMMU_PERM_WRITE);     \
> +    if (err || (plen < size / 8))                                         \
> +        return;                                                           \
> +                                                                          \
> +    st##suffix##_phys(paddr, val);                                        \
> +}
> +
> +DEFINE_PCI_LD(ub, 8)
> +DEFINE_PCI_LD(uw, 16)
> +DEFINE_PCI_LD(l, 32)
> +DEFINE_PCI_LD(q, 64)
> +
> +DEFINE_PCI_ST(b, 8)
> +DEFINE_PCI_ST(w, 16)
> +DEFINE_PCI_ST(l, 32)
> +DEFINE_PCI_ST(q, 64)
> +
> diff --git a/hw/pci.h b/hw/pci.h
> index c551f96..3131016 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -172,6 +172,8 @@ struct PCIDevice {
>      char *romfile;
>      ram_addr_t rom_offset;
>      uint32_t rom_bar;
> +
> +    QLIST_HEAD(, PCIMemoryMap) memory_maps;
>  };
>  
>  PCIDevice *pci_register_device(PCIBus *bus, const char *name,
> @@ -391,4 +393,76 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1,
>      return !(last2 < first1 || last1 < first2);
>  }
>  
> +/*
> + * Memory I/O and PCI IOMMU definitions.
> + */
> +
> +#define IOMMU_PERM_READ     (1 << 0)
> +#define IOMMU_PERM_WRITE    (1 << 1)
> +#define IOMMU_PERM_RW       (IOMMU_PERM_READ | IOMMU_PERM_WRITE)
> +
> +typedef int PCIInvalidateMapFunc(void *opaque);
> +typedef int PCITranslateFunc(PCIDevice *iommu,
> +                             PCIDevice *dev,
> +                             pcibus_t addr,
> +                             target_phys_addr_t *paddr,
> +                             target_phys_addr_t *len,
> +                             unsigned perms);
> +
> +extern void pci_memory_rw(PCIDevice *dev,
> +                          pcibus_t addr,
> +                          uint8_t *buf,
> +                          pcibus_t len,
> +                          int is_write);
> +extern void *pci_memory_map(PCIDevice *dev,
> +                            PCIInvalidateMapFunc *cb,
> +                            void *opaque,
> +                            pcibus_t addr,
> +                            target_phys_addr_t *len,
> +                            int is_write);
> +extern void pci_memory_unmap(PCIDevice *dev,
> +                             void *buffer,
> +                             target_phys_addr_t len,
> +                             int is_write,
> +                             target_phys_addr_t access_len);
> +extern void pci_register_iommu(PCIDevice *dev,
> +                               PCITranslateFunc *translate);
> +extern void pci_memory_invalidate_range(PCIDevice *dev,
> +                                        pcibus_t addr,
> +                                        pcibus_t len);
> +
> +#define DECLARE_PCI_LD(suffix, size)                                    \
> +extern uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr);
> +
> +#define DECLARE_PCI_ST(suffix, size)                                    \
> +extern void pci_st##suffix(PCIDevice *dev,                              \
> +                           pcibus_t addr,                               \
> +                           uint##size##_t val);
> +
> +DECLARE_PCI_LD(ub, 8)
> +DECLARE_PCI_LD(uw, 16)
> +DECLARE_PCI_LD(l, 32)
> +DECLARE_PCI_LD(q, 64)
> +
> +DECLARE_PCI_ST(b, 8)
> +DECLARE_PCI_ST(w, 16)
> +DECLARE_PCI_ST(l, 32)
> +DECLARE_PCI_ST(q, 64)
> +
> +static inline void pci_memory_read(PCIDevice *dev,
> +                                   pcibus_t addr,
> +                                   uint8_t *buf,
> +                                   pcibus_t len)
> +{
> +    pci_memory_rw(dev, addr, buf, len, 0);
> +}
> +
> +static inline void pci_memory_write(PCIDevice *dev,
> +                                    pcibus_t addr,
> +                                    const uint8_t *buf,
> +                                    pcibus_t len)
> +{
> +    pci_memory_rw(dev, addr, (uint8_t *) buf, len, 1);
> +}
> +
>  #endif
> diff --git a/hw/pci_internals.h b/hw/pci_internals.h
> index e3c93a3..fb134b9 100644
> --- a/hw/pci_internals.h
> +++ b/hw/pci_internals.h
> @@ -33,6 +33,9 @@ struct PCIBus {
>         Keep a count of the number of devices with raised IRQs.  */
>      int nirq;
>      int *irq_count;
> +
> +    PCIDevice                       *iommu;
> +    PCITranslateFunc                *translate;
>  };

Why is the translate pointer in a bus? I think it's the work of an iommu?

>  struct PCIBridge {
> @@ -44,4 +47,13 @@ struct PCIBridge {
>      const char *bus_name;
>  };
>  
> +struct PCIMemoryMap {
> +    pcibus_t                        addr;
> +    pcibus_t                        len;
> +    target_phys_addr_t              paddr;
> +    PCIInvalidateMapFunc            *invalidate;
> +    void                            *invalidate_opaque;

Can we have a structure that encapsulates the mapping
data instead of a void *?


> +    QLIST_ENTRY(PCIMemoryMap)       list;
> +};
> +
>  #endif /* QEMU_PCI_INTERNALS_H */
> diff --git a/qemu-common.h b/qemu-common.h
> index d735235..8b060e8 100644
> --- a/qemu-common.h
> +++ b/qemu-common.h
> @@ -218,6 +218,7 @@ typedef struct SMBusDevice SMBusDevice;
>  typedef struct PCIHostState PCIHostState;
>  typedef struct PCIExpressHost PCIExpressHost;
>  typedef struct PCIBus PCIBus;
> +typedef struct PCIMemoryMap PCIMemoryMap;
>  typedef struct PCIDevice PCIDevice;
>  typedef struct PCIBridge PCIBridge;
>  typedef struct SerialState SerialState;
> -- 
> 1.7.1
> 
Eduard - Gabriel Munteanu Sept. 2, 2010, 8:40 a.m. UTC | #2
On Thu, Sep 02, 2010 at 08:28:26AM +0300, Michael S. Tsirkin wrote:
> On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote:
> > PCI devices should access memory through pci_memory_*() instead of
> > cpu_physical_memory_*(). This also provides support for translation and
> > access checking in case an IOMMU is emulated.
> > 
> > Memory maps are treated as remote IOTLBs (that is, translation caches
> > belonging to the IOMMU-aware device itself). Clients (devices) must
> > provide callbacks for map invalidation in case these maps are
> > persistent beyond the current I/O context, e.g. AIO DMA transfers.
> > 
> > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> 
> 
> I am concerned about adding more pointer chasing on the data path.
> Could we have
> 1. an iommu pointer in a device, inherited by secondary buses
>    when they are created and by devices from buses when they are attached.
> 2. translation pointer in the iommu instead of the bus

The first solution I proposed was based on qdev, that is, each
DeviceState had an 'iommu' field. Translation would be done by
recursively looking in the parent bus/devs for an IOMMU.
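
That recursive lookup might have been sketched like this (assuming a hypothetical 'iommu' field on DeviceState, and parent links as qdev is assumed to expose them):

    /* Hypothetical: walk up the qdev tree until an IOMMU is found. */
    static DeviceState *qdev_find_iommu(DeviceState *dev)
    {
        while (dev) {
            if (dev->iommu) {           /* assumed field */
                return dev->iommu;
            }
            /* Continue with the device that owns the parent bus. */
            dev = dev->parent_bus ? dev->parent_bus->parent : NULL;
        }
        return NULL;                    /* no IOMMU above: direct access */
    }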

But Anthony said we're better off with bus-specific APIs, mostly because
(IIRC) there may be different types of addresses and it might be
difficult to abstract those properly.

I suppose I could revisit the idea by integrating the IOMMU in a
PCIDevice as opposed to a DeviceState.

Anthony, Paul, any thoughts on this?

> 3. pci_memory_XX functions inline, doing fast path for non-iommu case:
>    
> 	if (__builtin_expect(!dev->iommu, 1))
> 		return cpu_memory_rw

But isn't this some sort of 'if (likely(!dev->iommu))' from the Linux
kernel? If so, it puts the IOMMU-enabled case at a disadvantage.

I suppose most emulated systems would have at least some theoretical
reasons to enable the IOMMU, e.g. as a GART replacement (say for 32-bit
PCI devices) or for userspace drivers. So there are reasons to enable
the IOMMU even when you don't have a real host IOMMU and you're not
using nested guests.

> > ---
> >  hw/pci.c           |  185 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  hw/pci.h           |   74 +++++++++++++++++++++
> >  hw/pci_internals.h |   12 ++++
> >  qemu-common.h      |    1 +
> >  4 files changed, 271 insertions(+), 1 deletions(-)
> 
> Almost nothing here is PCI specific.
> Can this code go into dma.c/dma.h?
> We would have struct DMADevice, APIs like device_dma_write etc.
> This would help us get rid of the void * stuff as well?
> 

Yeah, I know, that's similar to what I intended to do at first. Though
I'm not sure that rids us of 'void *' stuff, quite on the contrary from
what I've seen.

Some stuff still needs to stay 'void *' (or an equivalent typedef, but
still an opaque) simply because of the level of abstraction that's needed.

[snip]

> > +void pci_register_iommu(PCIDevice *iommu,
> > +                        PCITranslateFunc *translate)
> > +{
> > +    iommu->bus->iommu = iommu;
> > +    iommu->bus->translate = translate;
> > +}
> > +
> 
> The above seems broken for secondary buses, right?  Also, can we use
> qdev for initialization in some way, instead of adding more APIs?  E.g.
> I think it would be nice if we could just use qdev command line flags to
> control which bus is behind iommu and which isn't.
> 
> 

Each bus must have its own IOMMU. The secondary bus should ask the
primary bus instead of going through cpu_physical_memory_*(). If that
isn't the case, it's broken and the secondary bus must be converted to
the new API just like regular devices. I'll have a look at that.
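
For a PCI-to-PCI bridge, that conversion might amount to a translate hook that defers upstream, roughly as in this sketch (assumes the secondary bus keeps a link to its bridge, here called parent_dev):

    /* Hypothetical: a secondary bus defers translation to the bus its
     * bridge sits on, so IOMMU checks compose across bridges. */
    static int pci_bridge_translate(PCIDevice *iommu, PCIDevice *dev,
                                    pcibus_t addr,
                                    target_phys_addr_t *paddr,
                                    target_phys_addr_t *len,
                                    unsigned perms)
    {
        PCIDevice *bridge = dev->bus->parent_dev;   /* assumed link */
        PCIBus *upstream = bridge->bus;

        return upstream->translate(upstream->iommu, dev,
                                   addr, paddr, len, perms);
    }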

> > +void pci_memory_rw(PCIDevice *dev,
> > +                   pcibus_t addr,
> > +                   uint8_t *buf,
> > +                   pcibus_t len,
> > +                   int is_write)
> > +{
> > +    int err;
> > +    unsigned perms;
> > +    PCIDevice *iommu = dev->bus->iommu;
> > +    target_phys_addr_t paddr, plen;
> > +
> > +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> > +
> > +    while (len) {
> > +        err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> > +        if (err)
> > +            return;
> > +
> > +        /* The translation might be valid for larger regions. */
> > +        if (plen > len)
> > +            plen = len;
> > +
> > +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> > +
> > +        len -= plen;
> > +        addr += plen;
> > +        buf += plen;
> > +    }
> > +}
> > +
> > +static void pci_memory_register_map(PCIDevice *dev,
> > +                                    pcibus_t addr,
> > +                                    pcibus_t len,
> > +                                    target_phys_addr_t paddr,
> > +                                    PCIInvalidateMapFunc *invalidate,
> > +                                    void *invalidate_opaque)
> > +{
> > +    PCIMemoryMap *map;
> > +
> > +    map = qemu_malloc(sizeof(PCIMemoryMap));
> > +    map->addr               = addr;
> > +    map->len                = len;
> > +    map->paddr              = paddr;
> > +    map->invalidate         = invalidate;
> > +    map->invalidate_opaque  = invalidate_opaque;
> > +
> > +    QLIST_INSERT_HEAD(&dev->memory_maps, map, list);
> > +}
> > +
> > +static void pci_memory_unregister_map(PCIDevice *dev,
> > +                                      target_phys_addr_t paddr,
> > +                                      target_phys_addr_t len)
> > +{
> > +    PCIMemoryMap *map;
> > +
> > +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> > +        if (map->paddr == paddr && map->len == len) {
> > +            QLIST_REMOVE(map, list);
> > +            free(map);
> > +        }
> > +    }
> > +}
> > +
> > +void pci_memory_invalidate_range(PCIDevice *dev,
> > +                                 pcibus_t addr,
> > +                                 pcibus_t len)
> > +{
> > +    PCIMemoryMap *map;
> > +
> > +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> > +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> > +            map->invalidate(map->invalidate_opaque);
> > +            QLIST_REMOVE(map, list);
> > +            free(map);
> > +        }
> > +    }
> > +}
> > +
> > +void *pci_memory_map(PCIDevice *dev,
> > +                     PCIInvalidateMapFunc *cb,
> > +                     void *opaque,
> > +                     pcibus_t addr,
> > +                     target_phys_addr_t *len,
> > +                     int is_write)
> > +{
> > +    int err;
> > +    unsigned perms;
> > +    PCIDevice *iommu = dev->bus->iommu;
> > +    target_phys_addr_t paddr, plen;
> > +
> > +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> > +
> > +    plen = *len;
> > +    err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> > +    if (err)
> > +        return NULL;
> > +
> > +    /*
> > +     * If this is true, the virtual region is contiguous,
> > +     * but the translated physical region isn't. We just
> > +     * clamp *len, much like cpu_physical_memory_map() does.
> > +     */
> > +    if (plen < *len)
> > +        *len = plen;
> > +
> > +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> > +    if (cb)
> > +        pci_memory_register_map(dev, addr, *len, paddr, cb, opaque);
> > +
> > +    return cpu_physical_memory_map(paddr, len, is_write);
> > +}
> > +
> 
> All the above is really only useful for when there is an iommu,
> right? So maybe we should shortcut all this if there's no iommu?
> 

Some people (e.g. Blue) suggested I shouldn't make the IOMMU emulation a
compile-time option, like I originally did. And I'm not sure any runtime
"optimization" (as in likely()/unlikely()) is justified.

[snip]

> > diff --git a/hw/pci_internals.h b/hw/pci_internals.h
> > index e3c93a3..fb134b9 100644
> > --- a/hw/pci_internals.h
> > +++ b/hw/pci_internals.h
> > @@ -33,6 +33,9 @@ struct PCIBus {
> >         Keep a count of the number of devices with raised IRQs.  */
> >      int nirq;
> >      int *irq_count;
> > +
> > +    PCIDevice                       *iommu;
> > +    PCITranslateFunc                *translate;
> >  };
> 
> Why is the translate pointer in a bus? I think it's the work of an iommu?
> 

Anthony and Paul thought it's best to simply ask the parent bus for
translation. I somewhat agree to that: devices that aren't IOMMU-aware
simply attempt to do PCI requests to memory and the IOMMU translates
and checks them transparently.

> >  struct PCIBridge {
> > @@ -44,4 +47,13 @@ struct PCIBridge {
> >      const char *bus_name;
> >  };
> >  
> > +struct PCIMemoryMap {
> > +    pcibus_t                        addr;
> > +    pcibus_t                        len;
> > +    target_phys_addr_t              paddr;
> > +    PCIInvalidateMapFunc            *invalidate;
> > +    void                            *invalidate_opaque;
> 
> Can we have a structure that encapsulates the mapping
> data instead of a void *?
> 
> 

Not really. 'invalidate_opaque' belongs to device code. It's meant to be
a handle to easily identify the mapping. For example, DMA code wants to
cancel AIO transfers when the bus requests the map to be invalidated.
It's difficult to look that AIO transfer up using non-opaque data.
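
For instance, a device's callback might look like this (hypothetical code; MyDMARequest is made up, bdrv_aio_cancel() is the block layer's existing cancel call):

    /* Hypothetical: the opaque identifies the in-flight request whose
     * mapping the IOMMU has just invalidated. */
    static int mydev_invalidate_map(void *opaque)
    {
        MyDMARequest *req = opaque;     /* device-private request state */

        bdrv_aio_cancel(req->aiocb);    /* stop the in-flight transfer */
        req->mapped = 0;                /* mark the mapping stale */
        return 0;
    }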

> > +    QLIST_ENTRY(PCIMemoryMap)       list;
> > +};
> > +
> >  #endif /* QEMU_PCI_INTERNALS_H */
> > diff --git a/qemu-common.h b/qemu-common.h
> > index d735235..8b060e8 100644
> > --- a/qemu-common.h
> > +++ b/qemu-common.h
> > @@ -218,6 +218,7 @@ typedef struct SMBusDevice SMBusDevice;
> >  typedef struct PCIHostState PCIHostState;
> >  typedef struct PCIExpressHost PCIExpressHost;
> >  typedef struct PCIBus PCIBus;
> > +typedef struct PCIMemoryMap PCIMemoryMap;
> >  typedef struct PCIDevice PCIDevice;
> >  typedef struct PCIBridge PCIBridge;
> >  typedef struct SerialState SerialState;
> > -- 
> > 1.7.1
> > 
Michael S. Tsirkin Sept. 2, 2010, 9:49 a.m. UTC | #3
On Thu, Sep 02, 2010 at 11:40:58AM +0300, Eduard - Gabriel Munteanu wrote:
> On Thu, Sep 02, 2010 at 08:28:26AM +0300, Michael S. Tsirkin wrote:
> > On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote:
> > > PCI devices should access memory through pci_memory_*() instead of
> > > cpu_physical_memory_*(). This also provides support for translation and
> > > access checking in case an IOMMU is emulated.
> > > 
> > > Memory maps are treated as remote IOTLBs (that is, translation caches
> > > belonging to the IOMMU-aware device itself). Clients (devices) must
> > > provide callbacks for map invalidation in case these maps are
> > > persistent beyond the current I/O context, e.g. AIO DMA transfers.
> > > 
> > > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> > 
> > 
> > I am concerned about adding more pointer chasing on the data path.
> > Could we have
> > 1. an iommu pointer in a device, inherited by secondary buses
> >    when they are created and by devices from buses when they are attached.
> > 2. translation pointer in the iommu instead of the bus
> 
> The first solution I proposed was based on qdev, that is, each
> DeviceState had an 'iommu' field. Translation would be done by
> recursively looking in the parent bus/devs for an IOMMU.
> 
> But Anthony said we're better off with bus-specific APIs, mostly because
> (IIRC) there may be different types of addresses and it might be
> difficult to abstract those properly.

Well we ended up with casting
away types to make pci callbacks fit in ide structure,
and silently assuming that all addresses are in fact 64 bit.
So maybe it's hard to abstract addresses properly, but
it appears we'll have to, to avoid even worse problems.

> I suppose I could revisit the idea by integrating the IOMMU in a
> PCIDevice as opposed to a DeviceState.
> 
> Anthony, Paul, any thoughts on this?

Just to clarify: this is an optimization idea.
Instead of a bus walk on each access, do the walk
when the device is attached to the bus, and copy the iommu
from the root to the device itself.

This will also make it possible to create
DMADeviceState structure which would have this iommu field,
and we'd use this structure instead of the void pointers all over.
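
In code, the attach-time copy might be as simple as (a sketch, assuming an 'iommu' field is added to PCIDevice):

    /* Hypothetical: resolve the IOMMU once at attach time, so the data
     * path only tests dev->iommu instead of chasing bus pointers. */
    static void pci_attach_device(PCIBus *bus, PCIDevice *dev)
    {
        dev->bus = bus;
        dev->iommu = bus->iommu;    /* inherited from the root bus */
    }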


> > 3. pci_memory_XX functions inline, doing fast path for non-iommu case:
> >    
> > 	if (__builtin_expect(!dev->iommu, 1))
> > 		return cpu_memory_rw
> 
> But isn't this some sort of 'if (likely(!dev->iommu))' from the Linux
> kernel? If so, it puts the IOMMU-enabled case at a disadvantage.

IOMMU has a ton of indirections anyway.

> I suppose most emulated systems would have at least some theoretical
> reasons to enable the IOMMU, e.g. as a GART replacement (say for 32-bit
> PCI devices) or for userspace drivers.
> So there are reasons to enable
> the IOMMU even when you don't have a real host IOMMU and you're not
> using nested guests.

The time when most people enable the IOMMU for all devices in both real and virtualized
systems appears distant; one of the reasons is that it has a lot of overhead.
Let's start by not adding overhead for existing users, makes sense?


> > > ---
> > >  hw/pci.c           |  185 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> > >  hw/pci.h           |   74 +++++++++++++++++++++
> > >  hw/pci_internals.h |   12 ++++
> > >  qemu-common.h      |    1 +
> > >  4 files changed, 271 insertions(+), 1 deletions(-)
> > 
> > Almost nothing here is PCI specific.
> > Can this code go into dma.c/dma.h?
> > We would have struct DMADevice, APIs like device_dma_write etc.
> > This would help us get rid of the void * stuff as well?
> > 
> 
> Yeah, I know, that's similar to what I intended to do at first. Though
> I'm not sure that rids us of 'void *' stuff, quite on the contrary from
> what I've seen.
> 
> Some stuff still needs to stay 'void *' (or an equivalent typedef, but
> still an opaque) simply because of the level of abstraction that's needed.
> 
> [snip]
> 
> > > +void pci_register_iommu(PCIDevice *iommu,
> > > +                        PCITranslateFunc *translate)
> > > +{
> > > +    iommu->bus->iommu = iommu;
> > > +    iommu->bus->translate = translate;
> > > +}
> > > +
> > 
> > The above seems broken for secondary buses, right?  Also, can we use
> > qdev for initialization in some way, instead of adding more APIs?  E.g.
> > I think it would be nice if we could just use qdev command line flags to
> > control which bus is behind iommu and which isn't.
> > 
> > 
> 
> Each bus must have its own IOMMU. The secondary bus should ask the
> primary bus instead of going through cpu_physical_memory_*(). If that
> isn't the case, it's broken and the secondary bus must be converted to
> the new API just like regular devices. I'll have a look at that.
> 
> > > +void pci_memory_rw(PCIDevice *dev,
> > > +                   pcibus_t addr,
> > > +                   uint8_t *buf,
> > > +                   pcibus_t len,
> > > +                   int is_write)
> > > +{
> > > +    int err;
> > > +    unsigned perms;
> > > +    PCIDevice *iommu = dev->bus->iommu;
> > > +    target_phys_addr_t paddr, plen;
> > > +
> > > +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> > > +
> > > +    while (len) {
> > > +        err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> > > +        if (err)
> > > +            return;
> > > +
> > > +        /* The translation might be valid for larger regions. */
> > > +        if (plen > len)
> > > +            plen = len;
> > > +
> > > +        cpu_physical_memory_rw(paddr, buf, plen, is_write);
> > > +
> > > +        len -= plen;
> > > +        addr += plen;
> > > +        buf += plen;
> > > +    }
> > > +}
> > > +
> > > +static void pci_memory_register_map(PCIDevice *dev,
> > > +                                    pcibus_t addr,
> > > +                                    pcibus_t len,
> > > +                                    target_phys_addr_t paddr,
> > > +                                    PCIInvalidateMapFunc *invalidate,
> > > +                                    void *invalidate_opaque)
> > > +{
> > > +    PCIMemoryMap *map;
> > > +
> > > +    map = qemu_malloc(sizeof(PCIMemoryMap));
> > > +    map->addr               = addr;
> > > +    map->len                = len;
> > > +    map->paddr              = paddr;
> > > +    map->invalidate         = invalidate;
> > > +    map->invalidate_opaque  = invalidate_opaque;
> > > +
> > > +    QLIST_INSERT_HEAD(&dev->memory_maps, map, list);
> > > +}
> > > +
> > > +static void pci_memory_unregister_map(PCIDevice *dev,
> > > +                                      target_phys_addr_t paddr,
> > > +                                      target_phys_addr_t len)
> > > +{
> > > +    PCIMemoryMap *map;
> > > +
> > > +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> > > +        if (map->paddr == paddr && map->len == len) {
> > > +            QLIST_REMOVE(map, list);
> > > +            free(map);
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +void pci_memory_invalidate_range(PCIDevice *dev,
> > > +                                 pcibus_t addr,
> > > +                                 pcibus_t len)
> > > +{
> > > +    PCIMemoryMap *map;
> > > +
> > > +    QLIST_FOREACH(map, &dev->memory_maps, list) {
> > > +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> > > +            map->invalidate(map->invalidate_opaque);
> > > +            QLIST_REMOVE(map, list);
> > > +            free(map);
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +void *pci_memory_map(PCIDevice *dev,
> > > +                     PCIInvalidateMapFunc *cb,
> > > +                     void *opaque,
> > > +                     pcibus_t addr,
> > > +                     target_phys_addr_t *len,
> > > +                     int is_write)
> > > +{
> > > +    int err;
> > > +    unsigned perms;
> > > +    PCIDevice *iommu = dev->bus->iommu;
> > > +    target_phys_addr_t paddr, plen;
> > > +
> > > +    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
> > > +
> > > +    plen = *len;
> > > +    err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
> > > +    if (err)
> > > +        return NULL;
> > > +
> > > +    /*
> > > +     * If this is true, the virtual region is contiguous,
> > > +     * but the translated physical region isn't. We just
> > > +     * clamp *len, much like cpu_physical_memory_map() does.
> > > +     */
> > > +    if (plen < *len)
> > > +        *len = plen;
> > > +
> > > +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> > > +    if (cb)
> > > +        pci_memory_register_map(dev, addr, *len, paddr, cb, opaque);
> > > +
> > > +    return cpu_physical_memory_map(paddr, len, is_write);
> > > +}
> > > +
> > 
> > All the above is really only useful for when there is an iommu,
> > right? So maybe we should shortcut all this if there's no iommu?
> > 
> 
> Some people (e.g. Blue) suggested I shouldn't make the IOMMU emulation a
> compile-time option, like I originally did. And I'm not sure any runtime
> "optimization" (as in likely()/unlikely()) is justified.
> 
> [snip]
> 
> > > diff --git a/hw/pci_internals.h b/hw/pci_internals.h
> > > index e3c93a3..fb134b9 100644
> > > --- a/hw/pci_internals.h
> > > +++ b/hw/pci_internals.h
> > > @@ -33,6 +33,9 @@ struct PCIBus {
> > >         Keep a count of the number of devices with raised IRQs.  */
> > >      int nirq;
> > >      int *irq_count;
> > > +
> > > +    PCIDevice                       *iommu;
> > > +    PCITranslateFunc                *translate;
> > >  };
> > 
> > Why is the translate pointer in a bus? I think it's the work of an iommu?
> > 
> 
> Anthony and Paul thought it's best to simply ask the parent bus for
> translation. I somewhat agree to that: devices that aren't IOMMU-aware
> simply attempt to do PCI requests to memory and the IOMMU translates
> and checks them transparently.
> 
> > >  struct PCIBridge {
> > > @@ -44,4 +47,13 @@ struct PCIBridge {
> > >      const char *bus_name;
> > >  };
> > >  
> > > +struct PCIMemoryMap {
> > > +    pcibus_t                        addr;
> > > +    pcibus_t                        len;
> > > +    target_phys_addr_t              paddr;
> > > +    PCIInvalidateMapFunc            *invalidate;
> > > +    void                            *invalidate_opaque;
> > 
> > Can we have a structure that encapsulates the mapping
> > data instead of a void *?
> > 
> > 
> 
> Not really. 'invalidate_opaque' belongs to device code. It's meant to be
> a handle to easily identify the mapping. For example, DMA code wants to
> cancel AIO transfers when the bus requests the map to be invalidated.
> It's difficult to look that AIO transfer up using non-opaque data.
> 
> > > +    QLIST_ENTRY(PCIMemoryMap)       list;
> > > +};
> > > +
> > >  #endif /* QEMU_PCI_INTERNALS_H */
> > > diff --git a/qemu-common.h b/qemu-common.h
> > > index d735235..8b060e8 100644
> > > --- a/qemu-common.h
> > > +++ b/qemu-common.h
> > > @@ -218,6 +218,7 @@ typedef struct SMBusDevice SMBusDevice;
> > >  typedef struct PCIHostState PCIHostState;
> > >  typedef struct PCIExpressHost PCIExpressHost;
> > >  typedef struct PCIBus PCIBus;
> > > +typedef struct PCIMemoryMap PCIMemoryMap;
> > >  typedef struct PCIDevice PCIDevice;
> > >  typedef struct PCIBridge PCIBridge;
> > >  typedef struct SerialState SerialState;
> > > -- 
> > > 1.7.1
> > > 
Blue Swirl Sept. 4, 2010, 9:01 a.m. UTC | #4
On Thu, Sep 2, 2010 at 9:49 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Thu, Sep 02, 2010 at 11:40:58AM +0300, Eduard - Gabriel Munteanu wrote:
>> On Thu, Sep 02, 2010 at 08:28:26AM +0300, Michael S. Tsirkin wrote:
>> > On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote:
>> > > PCI devices should access memory through pci_memory_*() instead of
>> > > cpu_physical_memory_*(). This also provides support for translation and
>> > > access checking in case an IOMMU is emulated.
>> > >
>> > > Memory maps are treated as remote IOTLBs (that is, translation caches
>> > > belonging to the IOMMU-aware device itself). Clients (devices) must
>> > > provide callbacks for map invalidation in case these maps are
>> > > persistent beyond the current I/O context, e.g. AIO DMA transfers.
>> > >
>> > > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
>> >
>> >
>> > I am concerned about adding more pointer chasing on the data path.
>> > Could we have
>> > 1. an iommu pointer in a device, inherited by secondary buses
>> >    when they are created and by devices from buses when they are attached.
>> > 2. translation pointer in the iommu instead of the bus
>>
>> The first solution I proposed was based on qdev, that is, each
>> DeviceState had an 'iommu' field. Translation would be done by
>> recursively looking in the parent bus/devs for an IOMMU.
>>
>> But Anthony said we're better off with bus-specific APIs, mostly because
>> (IIRC) there may be different types of addresses and it might be
>> difficult to abstract those properly.
>
> Well we ended up with casting
> away types to make pci callbacks fit in ide structure,
> and silently assuming that all addresses are in fact 64 bit.
> So maybe it's hard to abstract addresses properly, but
> it appears we'll have to, to avoid even worse problems.
>
>> I suppose I could revisit the idea by integrating the IOMMU in a
>> PCIDevice as opposed to a DeviceState.
>>
>> Anthony, Paul, any thoughts on this?
>
> Just to clarify: this is an optimization idea.
> Instead of a bus walk on each access, do the walk
> when the device is attached to the bus, and copy the iommu
> from the root to the device itself.
>
> This will also make it possible to create
> DMADeviceState structure which would have this iommu field,
> and we'd use this structure instead of the void pointers all over.
>
>
>> > 3. pci_memory_XX functions inline, doing fast path for non-iommu case:
>> >
>> >     if (__builtin_expect(!dev->iommu, 1))
>> >             return cpu_memory_rw
>>
>> But isn't this some sort of 'if (likely(!dev->iommu))' from the Linux
>> kernel? If so, it puts the IOMMU-enabled case at a disadvantage.
>
> IOMMU has a ton of indirections anyway.
>
>> I suppose most emulated systems would have at least some theoretical
>> reasons to enable the IOMMU, e.g. as a GART replacement (say for 32-bit
>> PCI devices) or for userspace drivers.
>> So there are reasons to enable
>> the IOMMU even when you don't have a real host IOMMU and you're not
>> using nested guests.
>
> The time when most people enable the IOMMU for all devices in both real and virtualized
> systems appears distant; one of the reasons is that it has a lot of overhead.
> Let's start by not adding overhead for existing users, makes sense?

I think the goal architecture (not for IOMMU, but in general) is one
with zero copy DMA. This means we have stage one where the addresses
are translated to host pointers and stage two where the read/write
operation happens. The devices need to be converted.
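
With the API from this patch, such a two-stage access would look roughly like the following (the function and its parameters are placeholders; pci_memory_map()/pci_memory_unmap() are the patch's own calls):

    /* Hypothetical zero-copy write: translate once, then access the
     * guest memory in place instead of bouncing through a buffer. */
    static void mydev_dma_write(PCIDevice *dev, pcibus_t bus_addr,
                                const uint8_t *data, size_t size,
                                PCIInvalidateMapFunc *invalidate_cb,
                                void *opaque)
    {
        target_phys_addr_t maplen = size;
        void *host;

        /* Stage one: translate the bus address to a host pointer. */
        host = pci_memory_map(dev, invalidate_cb, opaque, bus_addr,
                              &maplen, 1);
        if (host) {
            /* Stage two: the actual access, with no intermediate copy. */
            memcpy(host, data, maplen);
            pci_memory_unmap(dev, host, maplen, 1, maplen);
        }
    }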

Now, let's consider the IOMMU in this zero copy architecture. It's one
stage of address translation; for the access operation it will not
matter. We can add translation caching at device level (or even at any
intermediate levels), but that needs a cache invalidation callback
system as discussed earlier. This can be implemented later, we need
the zero copy stuff first.

Given this overall picture, I think eliminating some pointer
dereference overheads in a non-zero copy architecture is a very
premature optimization, and it may even direct the architecture in the
wrong direction.

If the performance degradation at this point is not acceptable, we
could also postpone merging the IOMMU until the zero copy conversion has
happened, or make the IOMMU a compile-time option. But it would be nice to
back the decision with performance figures.
Michael S. Tsirkin Sept. 5, 2010, 7:10 a.m. UTC | #5
On Sat, Sep 04, 2010 at 09:01:06AM +0000, Blue Swirl wrote:
> On Thu, Sep 2, 2010 at 9:49 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > On Thu, Sep 02, 2010 at 11:40:58AM +0300, Eduard - Gabriel Munteanu wrote:
> >> On Thu, Sep 02, 2010 at 08:28:26AM +0300, Michael S. Tsirkin wrote:
> >> > On Sat, Aug 28, 2010 at 05:54:53PM +0300, Eduard - Gabriel Munteanu wrote:
> >> > > PCI devices should access memory through pci_memory_*() instead of
> >> > > cpu_physical_memory_*(). This also provides support for translation and
> >> > > access checking in case an IOMMU is emulated.
> >> > >
> >> > > Memory maps are treated as remote IOTLBs (that is, translation caches
> >> > > belonging to the IOMMU-aware device itself). Clients (devices) must
> >> > > provide callbacks for map invalidation in case these maps are
> >> > > persistent beyond the current I/O context, e.g. AIO DMA transfers.
> >> > >
> >> > > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> >> >
> >> >
> >> > I am concerned about adding more pointer chasing on the data path.
> >> > Could we have
> >> > 1. an iommu pointer in a device, inherited by secondary buses
> >> >    when they are created and by devices from buses when they are attached.
> >> > 2. translation pointer in the iommu instead of the bus
> >>
> >> The first solution I proposed was based on qdev, that is, each
> >> DeviceState had an 'iommu' field. Translation would be done by
> >> recursively looking in the parent bus/devs for an IOMMU.
> >>
> >> But Anthony said we're better off with bus-specific APIs, mostly because
> >> (IIRC) there may be different types of addresses and it might be
> >> difficult to abstract those properly.
> >
> > Well we ended up with casting
> > away types to make pci callbacks fit in ide structure,
> > and silently assuming that all addresses are in fact 64 bit.
> > So maybe it's hard to abstract addresses properly, but
> > it appears we'll have to, to avoid even worse problems.
> >
> >> I suppose I could revisit the idea by integrating the IOMMU in a
> >> PCIDevice as opposed to a DeviceState.
> >>
> >> Anthony, Paul, any thoughts on this?
> >
> > Just to clarify: this is an optimization idea.
> > Instead of a bus walk on each access, do the walk
> > when the device is attached to the bus, and copy the iommu
> > from the root to the device itself.
> >
> > This will also make it possible to create
> > DMADeviceState structure which would have this iommu field,
> > and we'd use this structure instead of the void pointers all over.
> >
> >
> >> > 3. pci_memory_XX functions inline, doing fast path for non-iommu case:
> >> >
> >> >     if (__builtin_expect(!dev->iommu, 1))
> >> >             return cpu_memory_rw
> >>
> >> But isn't this some sort of 'if (likely(!dev->iommu))' from the Linux
> >> kernel? If so, it puts the IOMMU-enabled case at a disadvantage.
> >
> > IOMMU has a ton of indirections anyway.
> >
> >> I suppose most emulated systems would have at least some theoretical
> >> reasons to enable the IOMMU, e.g. as a GART replacement (say for 32-bit
> >> PCI devices) or for userspace drivers.
> >> So there are reasons to enable
> >> the IOMMU even when you don't have a real host IOMMU and you're not
> >> using nested guests.
> >
> > The time when most people enable the IOMMU for all devices in both real and virtualized
> > systems appears distant; one of the reasons is that it has a lot of overhead.
> > Let's start by not adding overhead for existing users, makes sense?
> 
> I think the goal architecture (not for IOMMU, but in general) is one
> with zero copy DMA. This means we have stage one where the addresses
> are translated to host pointers and stage two where the read/write
> operation happens. The devices need to be converted.
> 
> Now, let's consider the IOMMU in this zero copy architecture. It's one
> stage of address translation; for the access operation it will not
> matter. We can add translation caching at device level (or even at any
> intermediate levels), but that needs a cache invalidation callback
> system as discussed earlier. This can be implemented later, we need
> the zero copy stuff first.
> 
> Given this overall picture, I think eliminating some pointer
> dereference overheads in a non-zero copy architecture is a very
> premature optimization, and it may even direct the architecture in the
> wrong direction.
> 
> If the performance degradation at this point is not acceptable, we
> could also postpone merging the IOMMU until the zero copy conversion has
> happened, or make the IOMMU a compile-time option. But it would be nice to
> back the decision with performance figures.

I agree; a minimal benchmark showing no performance impact
when disabled would put these concerns to rest.

Patch

diff --git a/hw/pci.c b/hw/pci.c
index 2dc1577..b460905 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -158,6 +158,19 @@  static void pci_device_reset(PCIDevice *dev)
     pci_update_mappings(dev);
 }
 
+static int pci_no_translate(PCIDevice *iommu,
+                            PCIDevice *dev,
+                            pcibus_t addr,
+                            target_phys_addr_t *paddr,
+                            target_phys_addr_t *len,
+                            unsigned perms)
+{
+    *paddr = addr;
+    *len = -1;
+
+    return 0;
+}
+
 static void pci_bus_reset(void *opaque)
 {
     PCIBus *bus = opaque;
@@ -220,7 +233,10 @@  void pci_bus_new_inplace(PCIBus *bus, DeviceState *parent,
 {
     qbus_create_inplace(&bus->qbus, &pci_bus_info, parent, name);
     assert(PCI_FUNC(devfn_min) == 0);
-    bus->devfn_min = devfn_min;
+
+    bus->devfn_min  = devfn_min;
+    bus->iommu      = NULL;
+    bus->translate  = pci_no_translate;
 
     /* host bridge */
     QLIST_INIT(&bus->child);
@@ -1789,3 +1805,170 @@  static char *pcibus_get_dev_path(DeviceState *dev)
     return strdup(path);
 }
 
+void pci_register_iommu(PCIDevice *iommu,
+                        PCITranslateFunc *translate)
+{
+    iommu->bus->iommu = iommu;
+    iommu->bus->translate = translate;
+}
+
+void pci_memory_rw(PCIDevice *dev,
+                   pcibus_t addr,
+                   uint8_t *buf,
+                   pcibus_t len,
+                   int is_write)
+{
+    int err;
+    unsigned perms;
+    PCIDevice *iommu = dev->bus->iommu;
+    target_phys_addr_t paddr, plen;
+
+    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
+
+    while (len) {
+        err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
+        if (err)
+            return;
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len)
+            plen = len;
+
+        cpu_physical_memory_rw(paddr, buf, plen, is_write);
+
+        len -= plen;
+        addr += plen;
+        buf += plen;
+    }
+}
+
+static void pci_memory_register_map(PCIDevice *dev,
+                                    pcibus_t addr,
+                                    pcibus_t len,
+                                    target_phys_addr_t paddr,
+                                    PCIInvalidateMapFunc *invalidate,
+                                    void *invalidate_opaque)
+{
+    PCIMemoryMap *map;
+
+    map = qemu_malloc(sizeof(PCIMemoryMap));
+    map->addr               = addr;
+    map->len                = len;
+    map->paddr              = paddr;
+    map->invalidate         = invalidate;
+    map->invalidate_opaque  = invalidate_opaque;
+
+    QLIST_INSERT_HEAD(&dev->memory_maps, map, list);
+}
+
+static void pci_memory_unregister_map(PCIDevice *dev,
+                                      target_phys_addr_t paddr,
+                                      target_phys_addr_t len)
+{
+    PCIMemoryMap *map;
+
+    QLIST_FOREACH(map, &dev->memory_maps, list) {
+        if (map->paddr == paddr && map->len == len) {
+            QLIST_REMOVE(map, list);
+            free(map);
+        }
+    }
+}
+
+void pci_memory_invalidate_range(PCIDevice *dev,
+                                 pcibus_t addr,
+                                 pcibus_t len)
+{
+    PCIMemoryMap *map;
+
+    QLIST_FOREACH(map, &dev->memory_maps, list) {
+        if (ranges_overlap(addr, len, map->addr, map->len)) {
+            map->invalidate(map->invalidate_opaque);
+            QLIST_REMOVE(map, list);
+            free(map);
+        }
+    }
+}
+
+void *pci_memory_map(PCIDevice *dev,
+                     PCIInvalidateMapFunc *cb,
+                     void *opaque,
+                     pcibus_t addr,
+                     target_phys_addr_t *len,
+                     int is_write)
+{
+    int err;
+    unsigned perms;
+    PCIDevice *iommu = dev->bus->iommu;
+    target_phys_addr_t paddr, plen;
+
+    perms = is_write ? IOMMU_PERM_WRITE : IOMMU_PERM_READ;
+
+    plen = *len;
+    err = dev->bus->translate(iommu, dev, addr, &paddr, &plen, perms);
+    if (err)
+        return NULL;
+
+    /*
+     * If this is true, the virtual region is contiguous,
+     * but the translated physical region isn't. We just
+     * clamp *len, much like cpu_physical_memory_map() does.
+     */
+    if (plen < *len)
+        *len = plen;
+
+    /* We treat maps as remote TLBs to cope with stuff like AIO. */
+    if (cb)
+        pci_memory_register_map(dev, addr, *len, paddr, cb, opaque);
+
+    return cpu_physical_memory_map(paddr, len, is_write);
+}
+
+void pci_memory_unmap(PCIDevice *dev,
+                      void *buffer,
+                      target_phys_addr_t len,
+                      int is_write,
+                      target_phys_addr_t access_len)
+{
+    cpu_physical_memory_unmap(buffer, len, is_write, access_len);
+    pci_memory_unregister_map(dev, (target_phys_addr_t) buffer, len);
+}
+
+#define DEFINE_PCI_LD(suffix, size)                                       \
+uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr)              \
+{                                                                         \
+    int err;                                                              \
+    target_phys_addr_t paddr, plen;                                       \
+                                                                          \
+    err = dev->bus->translate(dev->bus->iommu, dev,                       \
+                              addr, &paddr, &plen, IOMMU_PERM_READ);      \
+    if (err || (plen < size / 8))                                         \
+        return 0;                                                         \
+                                                                          \
+    return ld##suffix##_phys(paddr);                                      \
+}
+
+#define DEFINE_PCI_ST(suffix, size)                                       \
+void pci_st##suffix(PCIDevice *dev, pcibus_t addr, uint##size##_t val)    \
+{                                                                         \
+    int err;                                                              \
+    target_phys_addr_t paddr, plen;                                       \
+                                                                          \
+    err = dev->bus->translate(dev->bus->iommu, dev,                       \
+                              addr, &paddr, &plen, IOMMU_PERM_WRITE);     \
+    if (err || (plen < size / 8))                                         \
+        return;                                                           \
+                                                                          \
+    st##suffix##_phys(paddr, val);                                        \
+}
+
+DEFINE_PCI_LD(ub, 8)
+DEFINE_PCI_LD(uw, 16)
+DEFINE_PCI_LD(l, 32)
+DEFINE_PCI_LD(q, 64)
+
+DEFINE_PCI_ST(b, 8)
+DEFINE_PCI_ST(w, 16)
+DEFINE_PCI_ST(l, 32)
+DEFINE_PCI_ST(q, 64)
+
diff --git a/hw/pci.h b/hw/pci.h
index c551f96..3131016 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -172,6 +172,8 @@  struct PCIDevice {
     char *romfile;
     ram_addr_t rom_offset;
     uint32_t rom_bar;
+
+    QLIST_HEAD(, PCIMemoryMap) memory_maps;
 };
 
 PCIDevice *pci_register_device(PCIBus *bus, const char *name,
@@ -391,4 +393,76 @@  static inline int ranges_overlap(uint64_t first1, uint64_t len1,
     return !(last2 < first1 || last1 < first2);
 }
 
+/*
+ * Memory I/O and PCI IOMMU definitions.
+ */
+
+#define IOMMU_PERM_READ     (1 << 0)
+#define IOMMU_PERM_WRITE    (1 << 1)
+#define IOMMU_PERM_RW       (IOMMU_PERM_READ | IOMMU_PERM_WRITE)
+
+typedef int PCIInvalidateMapFunc(void *opaque);
+typedef int PCITranslateFunc(PCIDevice *iommu,
+                             PCIDevice *dev,
+                             pcibus_t addr,
+                             target_phys_addr_t *paddr,
+                             target_phys_addr_t *len,
+                             unsigned perms);
+
+extern void pci_memory_rw(PCIDevice *dev,
+                          pcibus_t addr,
+                          uint8_t *buf,
+                          pcibus_t len,
+                          int is_write);
+extern void *pci_memory_map(PCIDevice *dev,
+                            PCIInvalidateMapFunc *cb,
+                            void *opaque,
+                            pcibus_t addr,
+                            target_phys_addr_t *len,
+                            int is_write);
+extern void pci_memory_unmap(PCIDevice *dev,
+                             void *buffer,
+                             target_phys_addr_t len,
+                             int is_write,
+                             target_phys_addr_t access_len);
+extern void pci_register_iommu(PCIDevice *dev,
+                               PCITranslateFunc *translate);
+extern void pci_memory_invalidate_range(PCIDevice *dev,
+                                        pcibus_t addr,
+                                        pcibus_t len);
+
+#define DECLARE_PCI_LD(suffix, size)                                    \
+extern uint##size##_t pci_ld##suffix(PCIDevice *dev, pcibus_t addr);
+
+#define DECLARE_PCI_ST(suffix, size)                                    \
+extern void pci_st##suffix(PCIDevice *dev,                              \
+                           pcibus_t addr,                               \
+                           uint##size##_t val);
+
+DECLARE_PCI_LD(ub, 8)
+DECLARE_PCI_LD(uw, 16)
+DECLARE_PCI_LD(l, 32)
+DECLARE_PCI_LD(q, 64)
+
+DECLARE_PCI_ST(b, 8)
+DECLARE_PCI_ST(w, 16)
+DECLARE_PCI_ST(l, 32)
+DECLARE_PCI_ST(q, 64)
+
+static inline void pci_memory_read(PCIDevice *dev,
+                                   pcibus_t addr,
+                                   uint8_t *buf,
+                                   pcibus_t len)
+{
+    pci_memory_rw(dev, addr, buf, len, 0);
+}
+
+static inline void pci_memory_write(PCIDevice *dev,
+                                    pcibus_t addr,
+                                    const uint8_t *buf,
+                                    pcibus_t len)
+{
+    pci_memory_rw(dev, addr, (uint8_t *) buf, len, 1);
+}
+
 #endif
diff --git a/hw/pci_internals.h b/hw/pci_internals.h
index e3c93a3..fb134b9 100644
--- a/hw/pci_internals.h
+++ b/hw/pci_internals.h
@@ -33,6 +33,9 @@  struct PCIBus {
        Keep a count of the number of devices with raised IRQs.  */
     int nirq;
     int *irq_count;
+
+    PCIDevice                       *iommu;
+    PCITranslateFunc                *translate;
 };
 
 struct PCIBridge {
@@ -44,4 +47,13 @@  struct PCIBridge {
     const char *bus_name;
 };
 
+struct PCIMemoryMap {
+    pcibus_t                        addr;
+    pcibus_t                        len;
+    target_phys_addr_t              paddr;
+    PCIInvalidateMapFunc            *invalidate;
+    void                            *invalidate_opaque;
+    QLIST_ENTRY(PCIMemoryMap)       list;
+};
+
 #endif /* QEMU_PCI_INTERNALS_H */
diff --git a/qemu-common.h b/qemu-common.h
index d735235..8b060e8 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -218,6 +218,7 @@  typedef struct SMBusDevice SMBusDevice;
 typedef struct PCIHostState PCIHostState;
 typedef struct PCIExpressHost PCIExpressHost;
 typedef struct PCIBus PCIBus;
+typedef struct PCIMemoryMap PCIMemoryMap;
 typedef struct PCIDevice PCIDevice;
 typedef struct PCIBridge PCIBridge;
 typedef struct SerialState SerialState;