Patchwork [10/13] iommu: Introduce IOMMU emulation infrastructure

login
register
mail settings
Submitter David Gibson
Date March 1, 2012, 5:36 a.m.
Message ID <1330580169-15446-11-git-send-email-david@gibson.dropbear.id.au>
Download mbox | patch
Permalink /patch/143903/
State New
Headers show

Comments

David Gibson - March 1, 2012, 5:36 a.m.
This patch adds the basic infrastructure necessary to emulate an IOMMU
visible to the guest.  The DMAContext structure is extended with
information and a callback describing the translation, and the various
DMA functions used by devices will now perform IOMMU translation using
this callback.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 configure     |   12 ++++
 dma-helpers.c |  189 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 dma.h         |  126 +++++++++++++++++++++++++++++++-------
 3 files changed, 305 insertions(+), 22 deletions(-)
Michael S. Tsirkin - March 1, 2012, 3:49 p.m.
On Thu, Mar 01, 2012 at 04:36:06PM +1100, David Gibson wrote:
> This patch adds the basic infrastructure necessary to emulate an IOMMU
> visible to the guest.  The DMAContext structure is extended with
> information and a callback describing the translation, and the various
> DMA functions used by devices will now perform IOMMU translation using
> this callback.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Richard Henderson <rth@twiddle.net>
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  configure     |   12 ++++
>  dma-helpers.c |  189 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Why not just dma.c?

>  dma.h         |  126 +++++++++++++++++++++++++++++++-------
>  3 files changed, 305 insertions(+), 22 deletions(-)
> 
> diff --git a/configure b/configure
> index fb0e18e..3eff89c 100755
> --- a/configure
> +++ b/configure
> @@ -138,6 +138,7 @@ linux_aio=""
>  cap_ng=""
>  attr=""
>  libattr=""
> +iommu="yes"
>  xfs=""
>  
>  vhost_net="no"
> @@ -784,6 +785,10 @@ for opt do
>    ;;
>    --enable-vhost-net) vhost_net="yes"
>    ;;
> +  --enable-iommu) iommu="yes"
> +  ;;
> +  --disable-iommu) iommu="no"
> +  ;;
>    --disable-opengl) opengl="no"
>    ;;
>    --enable-opengl) opengl="yes"
> @@ -1085,6 +1090,8 @@ echo "  --enable-docs            enable documentation build"
>  echo "  --disable-docs           disable documentation build"
>  echo "  --disable-vhost-net      disable vhost-net acceleration support"
>  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> +echo "  --disable-iommu          disable IOMMU emulation support"
> +echo "  --enable-iommu           enable IOMMU emulation support"
>  echo "  --enable-trace-backend=B Set trace backend"
>  echo "                           Available backends:" $("$source_path"/scripts/tracetool --list-backends)
>  echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
> @@ -2935,6 +2942,7 @@ echo "posix_madvise     $posix_madvise"
>  echo "uuid support      $uuid"
>  echo "libcap-ng support $cap_ng"
>  echo "vhost-net support $vhost_net"
> +echo "IOMMU support     $iommu"
>  echo "Trace backend     $trace_backend"
>  echo "Trace output file $trace_file-<pid>"
>  echo "spice support     $spice"
> @@ -3812,6 +3820,10 @@ if test "$target_softmmu" = "yes" -a \( \
>    echo "CONFIG_NEED_MMU=y" >> $config_target_mak
>  fi
>  
> +if test "$iommu" = "yes" ; then
> +  echo "CONFIG_IOMMU=y" >> $config_host_mak
> +fi
> +
>  if test "$gprof" = "yes" ; then
>    echo "TARGET_GPROF=yes" >> $config_target_mak
>    if test "$target_linux_user" = "yes" ; then
> diff --git a/dma-helpers.c b/dma-helpers.c
> index 9dcfb2c..8269d60 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -10,6 +10,9 @@
>  #include "dma.h"
>  #include "block_int.h"
>  #include "trace.h"
> +#include "range.h"
> +
> +/*#define DEBUG_IOMMU*/
>  
>  void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
>  {
> @@ -255,3 +258,189 @@ void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
>  {
>      bdrv_acct_start(bs, cookie, sg->size, type);
>  }
> +
> +#ifdef CONFIG_IOMMU
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir)
> +{
> +    target_phys_addr_t paddr, plen;
> +
> +#ifdef DEBUG_DMA
> +    fprintf(stderr, "dma_memory_check iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> +            (unsigned long long)addr, (unsigned long long)len, dir);
> +#endif
> +
> +    while (len) {
> +        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
> +            return false;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return true;
> +}
> +
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir)

By the way, users still ignore the return code
from the pci routines.
I suspect the API of returning errors is just not
such a good one. What happens on read errors with
real devices? Master abort? What about write errors?
Maybe we need a callback
that will set error flags in the device instead.

> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_rw iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> +            (unsigned long long)addr, (unsigned long long)len, dir);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen, dir);
> +        if (err) {
> +            return -1;

Why not return err?

> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_rw(paddr, buf, plen,
> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_zero iommu=%p addr=0x%llx len=0x%llx\n",
> +            (unsigned long long)addr, (unsigned long long)len);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen,
> +                             DMA_DIRECTION_FROM_DEVICE);
> +        if (err) {
> +            return -1;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_zero(paddr, plen);
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +typedef struct DMAMemoryMap DMAMemoryMap;
> +struct DMAMemoryMap {
> +    dma_addr_t              addr;
> +    size_t                  len;
> +    void                    *buf;
> +    DMAInvalidateMapFunc    *invalidate;
> +    void                    *invalidate_opaque;

Do we absolutely need the opaque value?
Can we let the user allocate the map objects?
Then user can use safe container_of to get user data.

> +
> +    QLIST_ENTRY(DMAMemoryMap) list;
> +};
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc fn, void *opaque)
> +{
> +    dma->translate = fn;
> +    dma->opaque = opaque;
> +    QLIST_INIT(&dma->memory_maps);
> +}
> +
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len)
> +{
> +    DMAMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> +            map->invalidate(map->invalidate_opaque);
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
> +                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir)
> +{
> +    int err;
> +    target_phys_addr_t paddr, plen;
> +    void *buf;
> +
> +    plen = *len;
> +    err = dma->translate(dma, addr, &paddr, &plen, dir);
> +    if (err) {
> +        return NULL;
> +    }
> +
> +    /*
> +     * If this is true, the virtual region is contiguous,
> +     * but the translated physical region isn't. We just
> +     * clamp *len, much like cpu_physical_memory_map() does.
> +     */
> +    if (plen < *len) {
> +        *len = plen;
> +    }
> +
> +    buf = cpu_physical_memory_map(paddr, &plen,
> +                                  dir == DMA_DIRECTION_FROM_DEVICE);
> +    *len = plen;
> +
> +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> +    if (cb) {
> +        DMAMemoryMap *map;
> +
> +        map = g_malloc(sizeof(DMAMemoryMap));
> +        map->addr = addr;
> +        map->len = *len;
> +        map->buf = buf;
> +        map->invalidate = cb;
> +        map->invalidate_opaque = cb_opaque;
> +
> +        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
> +    }
> +
> +    return buf;
> +}
> +
> +void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len)
> +{
> +    DMAMemoryMap *map;
> +
> +    cpu_physical_memory_unmap(buffer, len,
> +                              dir == DMA_DIRECTION_FROM_DEVICE,
> +                              access_len);
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if ((map->buf == buffer) && (map->len == len)) {
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +#endif /* CONFIG_IOMMU */
> diff --git a/dma.h b/dma.h
> index a66e3d7..c27f8b3 100644
> --- a/dma.h
> +++ b/dma.h
> @@ -15,6 +15,7 @@
>  #include "hw/hw.h"
>  #include "block.h"
>  
> +typedef struct DMAContext DMAContext;
>  typedef struct ScatterGatherEntry ScatterGatherEntry;
>  
>  typedef enum {
> @@ -31,30 +32,83 @@ struct QEMUSGList {
>  };
>  
>  #if defined(TARGET_PHYS_ADDR_BITS)
> +
> +#ifdef CONFIG_IOMMU
> +/*
> + * When an IOMMU is present, bus addresses become distinct from
> + * CPU/memory physical addresses and may be a different size.  Because
> + * the IOVA size depends more on the bus than on the platform, we more
> + * or less have to treat these as 64-bit always to cover all (or at
> + * least most) cases.
> + */
> +typedef uint64_t dma_addr_t;
> +
> +#define DMA_ADDR_BITS 64
> +#define DMA_ADDR_FMT "%" PRIx64
> +#else
>  typedef target_phys_addr_t dma_addr_t;
>  
>  #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
>  #define DMA_ADDR_FMT TARGET_FMT_plx
> +#endif
> +
> +typedef int DMATranslateFunc(DMAContext *dma,
> +                             dma_addr_t addr,
> +                             target_phys_addr_t *paddr,
> +                             target_phys_addr_t *len,
> +                             DMADirection dir);
> +
> +typedef struct DMAContext {
> +#ifdef CONFIG_IOMMU
> +    DMATranslateFunc *translate;
> +    void *opaque;
> +    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
> +#endif
> +} DMAContext;
> +

Here I suspect opaque is not needed at all.

> +#ifdef CONFIG_IOMMU
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return (dma != NULL);
> +}

() not necessary here. Neither is != NULL.

> +#else
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return false;
> +}
> +#endif
>  
>  typedef void DMAInvalidateMapFunc(void *);
>  
>  /* Checks that the given range of addresses is valid for DMA.  This is
>   * useful for certain cases, but usually you should just use
>   * dma_memory_{read,write}() and check for errors */
> -static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
> -                                    dma_addr_t len, DMADirection dir)
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir);
> +static inline bool dma_memory_valid(DMAContext *dma,
> +                                    dma_addr_t addr, dma_addr_t len,
> +                                    DMADirection dir)
>  {
> -    /* Stub version, with no iommu we assume all bus addresses are valid */
> -    return true;
> +    if (!dma_has_iommu(dma)) {
> +        return true;
> +    } else {
> +        return __dma_memory_valid(dma, addr, len, dir);
> +    }
>  }
>  
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir);
>  static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
>                                  void *buf, dma_addr_t len, DMADirection dir)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
> -                           dir == DMA_DIRECTION_FROM_DEVICE);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,

cast not needed, I think.

> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +        return 0;
> +    } else {
> +        return __dma_memory_rw(dma, addr, buf, len, dir);
> +    }
>  }
>  
>  static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
> @@ -70,35 +124,55 @@ static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr,
>                           DMA_DIRECTION_FROM_DEVICE);
>  }
>  
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
>  static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
>                                    dma_addr_t len)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_zero(addr, len);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_zero(addr, len);
> +        return 0;
> +    } else {
> +        return __dma_memory_zero(dma, addr, len);
> +    }
>  }
>  
> +void *__dma_memory_map(DMAContext *dma,
> +                       DMAInvalidateMapFunc *cb, void *opaque,
> +                       dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir);
>  static inline void *dma_memory_map(DMAContext *dma,
> -                                   DMAInvalidateMapFunc *cb, void *opaque,
> +                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
>                                     dma_addr_t addr, dma_addr_t *len,
>                                     DMADirection dir)
>  {
> -    target_phys_addr_t xlen = *len;
> -    void *p;
> -
> -    p = cpu_physical_memory_map(addr, &xlen,
> -                                dir == DMA_DIRECTION_FROM_DEVICE);
> -    *len = xlen;
> -    return p;
> +    if (!dma_has_iommu(dma)) {
> +        target_phys_addr_t xlen = *len;
> +        void *p;
> +
> +        p = cpu_physical_memory_map(addr, &xlen,
> +                                    dir == DMA_DIRECTION_FROM_DEVICE);
> +        *len = xlen;
> +        return p;
> +    } else {
> +        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
> +    }
>  }

Are callbacks ever useful? I note that without an iommu, they are never
invoked. So how can a device use them?

>  
> +void __dma_memory_unmap(DMAContext *dma,
> +                        void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len);
>  static inline void dma_memory_unmap(DMAContext *dma,
>                                      void *buffer, dma_addr_t len,
>                                      DMADirection dir, dma_addr_t access_len)
>  {
> -    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> -                                     dir == DMA_DIRECTION_FROM_DEVICE,
> -                                     access_len);
> +    if (!dma_has_iommu(dma)) {
> +        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> +                                         dir == DMA_DIRECTION_FROM_DEVICE,
> +                                         access_len);
> +    } else {
> +        __dma_memory_unmap(dma, buffer, len, dir, access_len);
> +    }
>  }
>  
>  #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
> @@ -139,6 +213,14 @@ DEFINE_LDST_DMA(q, q, 64, be);
>  
>  #undef DEFINE_LDST_DMA
>  
> +#ifdef CONFIG_IOMMU
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc, void *opaque);
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len);
> +
> +#endif /* CONFIG_IOMMU */
> +
>  struct ScatterGatherEntry {
>      dma_addr_t base;
>      dma_addr_t len;
> -- 
> 1.7.9
David Gibson - March 2, 2012, 3:09 a.m.
On Thu, Mar 01, 2012 at 05:49:20PM +0200, Michael S. Tsirkin wrote:
> On Thu, Mar 01, 2012 at 04:36:06PM +1100, David Gibson wrote:
> > This patch adds the basic infrastructure necessary to emulate an IOMMU
> > visible to the guest.  The DMAContext structure is extended with
> > information and a callback describing the translation, and the various
> > DMA functions used by devices will now perform IOMMU translation using
> > this callback.
> > 
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Richard Henderson <rth@twiddle.net>
> > 
> > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  configure     |   12 ++++
> >  dma-helpers.c |  189 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 
> Why not just dma.c?

Because dma-helpers.c already includes most of the functions defined
in dma.h, and because it would increase risk of confusion with
hw/dma.c which contains code specific to the old ISA DMA controller.

> >  dma.h         |  126 +++++++++++++++++++++++++++++++-------
> >  3 files changed, 305 insertions(+), 22 deletions(-)
> > 
> > diff --git a/configure b/configure
> > index fb0e18e..3eff89c 100755
> > --- a/configure
> > +++ b/configure
> > @@ -138,6 +138,7 @@ linux_aio=""
> >  cap_ng=""
> >  attr=""
> >  libattr=""
> > +iommu="yes"
> >  xfs=""
> >  
> >  vhost_net="no"
> > @@ -784,6 +785,10 @@ for opt do
> >    ;;
> >    --enable-vhost-net) vhost_net="yes"
> >    ;;
> > +  --enable-iommu) iommu="yes"
> > +  ;;
> > +  --disable-iommu) iommu="no"
> > +  ;;
> >    --disable-opengl) opengl="no"
> >    ;;
> >    --enable-opengl) opengl="yes"
> > @@ -1085,6 +1090,8 @@ echo "  --enable-docs            enable documentation build"
> >  echo "  --disable-docs           disable documentation build"
> >  echo "  --disable-vhost-net      disable vhost-net acceleration support"
> >  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> > +echo "  --disable-iommu          disable IOMMU emulation support"
> > +echo "  --enable-iommu           enable IOMMU emulation support"
> >  echo "  --enable-trace-backend=B Set trace backend"
> >  echo "                           Available backends:" $("$source_path"/scripts/tracetool --list-backends)
> >  echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
> > @@ -2935,6 +2942,7 @@ echo "posix_madvise     $posix_madvise"
> >  echo "uuid support      $uuid"
> >  echo "libcap-ng support $cap_ng"
> >  echo "vhost-net support $vhost_net"
> > +echo "IOMMU support     $iommu"
> >  echo "Trace backend     $trace_backend"
> >  echo "Trace output file $trace_file-<pid>"
> >  echo "spice support     $spice"
> > @@ -3812,6 +3820,10 @@ if test "$target_softmmu" = "yes" -a \( \
> >    echo "CONFIG_NEED_MMU=y" >> $config_target_mak
> >  fi
> >  
> > +if test "$iommu" = "yes" ; then
> > +  echo "CONFIG_IOMMU=y" >> $config_host_mak
> > +fi
> > +
> >  if test "$gprof" = "yes" ; then
> >    echo "TARGET_GPROF=yes" >> $config_target_mak
> >    if test "$target_linux_user" = "yes" ; then
> > diff --git a/dma-helpers.c b/dma-helpers.c
> > index 9dcfb2c..8269d60 100644
> > --- a/dma-helpers.c
> > +++ b/dma-helpers.c
> > @@ -10,6 +10,9 @@
> >  #include "dma.h"
> >  #include "block_int.h"
> >  #include "trace.h"
> > +#include "range.h"
> > +
> > +/*#define DEBUG_IOMMU*/
> >  
> >  void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
> >  {
> > @@ -255,3 +258,189 @@ void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
> >  {
> >      bdrv_acct_start(bs, cookie, sg->size, type);
> >  }
> > +
> > +#ifdef CONFIG_IOMMU
> > +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> > +                        DMADirection dir)
> > +{
> > +    target_phys_addr_t paddr, plen;
> > +
> > +#ifdef DEBUG_DMA
> > +    fprintf(stderr, "dma_memory_check iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> > +            (unsigned long long)addr, (unsigned long long)len, dir);
> > +#endif
> > +
> > +    while (len) {
> > +        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
> > +            return false;
> > +        }
> > +
> > +        /* The translation might be valid for larger regions. */
> > +        if (plen > len) {
> > +            plen = len;
> > +        }
> > +
> > +        len -= plen;
> > +        addr += plen;
> > +    }
> > +
> > +    return true;
> > +}
> > +
> > +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> > +                    void *buf, dma_addr_t len, DMADirection dir)
> 
> By the way, users still ignore the return code
> from the pci routines.
> I suspect the API of returning errors is just not
> such a good one. What happens on read errors with
> real devices? Master abort? What about write errors?
> Maybe we need a callback
> that will set error flags in the device instead.

Well, it depends on the device.  I think in most cases it would
trigger some sort of error inside the device, which would most easily
be handled by an error return from here.  In some cases it might put
the whole bus in an error state.

I don't think treating the error state of these calls as optional
information for the driver is necessarily a problem.  As long as
accesses to not IOMMU mapped addresses don't lead to actual accesses
to guest memory (which they don't) we've satisfied the main semantic
of the IOMMU.

Whether and how the devices respond to the error condition is then a
matter of how complete and accurate we want their modelling of error
states to be.  That seems to me to be in the nice-to-have but not
really that vital category, which we can leave until someone wants it.

> > +{
> > +    target_phys_addr_t paddr, plen;
> > +    int err;
> > +
> > +#ifdef DEBUG_IOMMU
> > +    fprintf(stderr, "dma_memory_rw iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
> > +            (unsigned long long)addr, (unsigned long long)len, dir);
> > +#endif
> > +
> > +    while (len) {
> > +        err = dma->translate(dma, addr, &paddr, &plen, dir);
> > +        if (err) {
> > +            return -1;
> 
> Why not return err?

No good reason, I'll fix that.

> > +        }
> > +
> > +        /* The translation might be valid for larger regions. */
> > +        if (plen > len) {
> > +            plen = len;
> > +        }
> > +
> > +        cpu_physical_memory_rw(paddr, buf, plen,
> > +                               dir == DMA_DIRECTION_FROM_DEVICE);
> > +
> > +        len -= plen;
> > +        addr += plen;
> > +        buf += plen;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
> > +{
> > +    target_phys_addr_t paddr, plen;
> > +    int err;
> > +
> > +#ifdef DEBUG_IOMMU
> > +    fprintf(stderr, "dma_memory_zero iommu=%p addr=0x%llx len=0x%llx\n",
> > +            (unsigned long long)addr, (unsigned long long)len);
> > +#endif
> > +
> > +    while (len) {
> > +        err = dma->translate(dma, addr, &paddr, &plen,
> > +                             DMA_DIRECTION_FROM_DEVICE);
> > +        if (err) {
> > +            return -1;
> > +        }
> > +
> > +        /* The translation might be valid for larger regions. */
> > +        if (plen > len) {
> > +            plen = len;
> > +        }
> > +
> > +        cpu_physical_memory_zero(paddr, plen);
> > +
> > +        len -= plen;
> > +        addr += plen;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +typedef struct DMAMemoryMap DMAMemoryMap;
> > +struct DMAMemoryMap {
> > +    dma_addr_t              addr;
> > +    size_t                  len;
> > +    void                    *buf;
> > +    DMAInvalidateMapFunc    *invalidate;
> > +    void                    *invalidate_opaque;
> 
> Do we absolutely need the opaque value?
> Can we let the user allocate the map objects?
> Then user can use safe container_of to get user data.

So, I was wondering whether having users allocate the DMAMemoryMap
structure might be better.  It has some advantages, though it does
mean more invasive changes to the callers.  I'll take that as a vote
in favour of that alternative approach.

> > +    QLIST_ENTRY(DMAMemoryMap) list;
> > +};
> > +
> > +void dma_context_init(DMAContext *dma, DMATranslateFunc fn, void *opaque)
> > +{
> > +    dma->translate = fn;
> > +    dma->opaque = opaque;
> > +    QLIST_INIT(&dma->memory_maps);
> > +}
> > +
> > +void dma_invalidate_memory_range(DMAContext *dma,
> > +                                 dma_addr_t addr, dma_addr_t len)
> > +{
> > +    DMAMemoryMap *map;
> > +
> > +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> > +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> > +            map->invalidate(map->invalidate_opaque);
> > +            QLIST_REMOVE(map, list);
> > +            free(map);
> > +        }
> > +    }
> > +}
> > +
> > +void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
> > +                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
> > +                       DMADirection dir)
> > +{
> > +    int err;
> > +    target_phys_addr_t paddr, plen;
> > +    void *buf;
> > +
> > +    plen = *len;
> > +    err = dma->translate(dma, addr, &paddr, &plen, dir);
> > +    if (err) {
> > +        return NULL;
> > +    }
> > +
> > +    /*
> > +     * If this is true, the virtual region is contiguous,
> > +     * but the translated physical region isn't. We just
> > +     * clamp *len, much like cpu_physical_memory_map() does.
> > +     */
> > +    if (plen < *len) {
> > +        *len = plen;
> > +    }
> > +
> > +    buf = cpu_physical_memory_map(paddr, &plen,
> > +                                  dir == DMA_DIRECTION_FROM_DEVICE);
> > +    *len = plen;
> > +
> > +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> > +    if (cb) {
> > +        DMAMemoryMap *map;
> > +
> > +        map = g_malloc(sizeof(DMAMemoryMap));
> > +        map->addr = addr;
> > +        map->len = *len;
> > +        map->buf = buf;
> > +        map->invalidate = cb;
> > +        map->invalidate_opaque = cb_opaque;
> > +
> > +        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
> > +    }
> > +
> > +    return buf;
> > +}
> > +
> > +void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
> > +                        DMADirection dir, dma_addr_t access_len)
> > +{
> > +    DMAMemoryMap *map;
> > +
> > +    cpu_physical_memory_unmap(buffer, len,
> > +                              dir == DMA_DIRECTION_FROM_DEVICE,
> > +                              access_len);
> > +
> > +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> > +        if ((map->buf == buffer) && (map->len == len)) {
> > +            QLIST_REMOVE(map, list);
> > +            free(map);
> > +        }
> > +    }
> > +}
> > +#endif /* CONFIG_IOMMU */
> > diff --git a/dma.h b/dma.h
> > index a66e3d7..c27f8b3 100644
> > --- a/dma.h
> > +++ b/dma.h
> > @@ -15,6 +15,7 @@
> >  #include "hw/hw.h"
> >  #include "block.h"
> >  
> > +typedef struct DMAContext DMAContext;
> >  typedef struct ScatterGatherEntry ScatterGatherEntry;
> >  
> >  typedef enum {
> > @@ -31,30 +32,83 @@ struct QEMUSGList {
> >  };
> >  
> >  #if defined(TARGET_PHYS_ADDR_BITS)
> > +
> > +#ifdef CONFIG_IOMMU
> > +/*
> > + * When an IOMMU is present, bus addresses become distinct from
> > + * CPU/memory physical addresses and may be a different size.  Because
> > + * the IOVA size depends more on the bus than on the platform, we more
> > + * or less have to treat these as 64-bit always to cover all (or at
> > + * least most) cases.
> > + */
> > +typedef uint64_t dma_addr_t;
> > +
> > +#define DMA_ADDR_BITS 64
> > +#define DMA_ADDR_FMT "%" PRIx64
> > +#else
> >  typedef target_phys_addr_t dma_addr_t;
> >  
> >  #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
> >  #define DMA_ADDR_FMT TARGET_FMT_plx
> > +#endif
> > +
> > +typedef int DMATranslateFunc(DMAContext *dma,
> > +                             dma_addr_t addr,
> > +                             target_phys_addr_t *paddr,
> > +                             target_phys_addr_t *len,
> > +                             DMADirection dir);
> > +
> > +typedef struct DMAContext {
> > +#ifdef CONFIG_IOMMU
> > +    DMATranslateFunc *translate;
> > +    void *opaque;
> > +    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
> > +#endif
> > +} DMAContext;
> > +
> 
> Here I suspect opaque is not needed at all.

Ok, so.  It's certainly not needed for all cases - I don't use it in
the example PAPR IOMMU.  The use cases I had in mind for this are for
things more like the Intel IOMMU which can be set up to have a
different domain (i.e. DMAContext) for each PCI function.  I was
thinking in that case that opaque could be used to store a pointer to
the relevant PCIDevice.

Or alternatively, container_of could be used to access per-context
IOMMU specific information, but opaque could be used to get at a
global IOMMU state structure.

But now you come to mention it, those cases can probably be handled by
extra pointers in the containing structure.  So I'll drop this in the
next version.

> > +#ifdef CONFIG_IOMMU
> > +static inline bool dma_has_iommu(DMAContext *dma)
> > +{
> > +    return (dma != NULL);
> > +}
> 
> () not necessary here. Neither is != NULL.

I know, but I was trying to be clearer and make the pointer -> bool
conversion explicit here.

> > +#else
> > +static inline bool dma_has_iommu(DMAContext *dma)
> > +{
> > +    return false;
> > +}
> > +#endif
> >  
> >  typedef void DMAInvalidateMapFunc(void *);
> >  
> >  /* Checks that the given range of addresses is valid for DMA.  This is
> >   * useful for certain cases, but usually you should just use
> >   * dma_memory_{read,write}() and check for errors */
> > -static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
> > -                                    dma_addr_t len, DMADirection dir)
> > +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> > +                        DMADirection dir);
> > +static inline bool dma_memory_valid(DMAContext *dma,
> > +                                    dma_addr_t addr, dma_addr_t len,
> > +                                    DMADirection dir)
> >  {
> > -    /* Stub version, with no iommu we assume all bus addresses are valid */
> > -    return true;
> > +    if (!dma_has_iommu(dma)) {
> > +        return true;
> > +    } else {
> > +        return __dma_memory_valid(dma, addr, len, dir);
> > +    }
> >  }
> >  
> > +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> > +                    void *buf, dma_addr_t len, DMADirection dir);
> >  static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> >                                  void *buf, dma_addr_t len, DMADirection dir)
> >  {
> > -    /* Stub version when we have no iommu support */
> > -    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
> > -                           dir == DMA_DIRECTION_FROM_DEVICE);
> > -    return 0;
> > +    if (!dma_has_iommu(dma)) {
> > +        /* Fast-path for no IOMMU */
> > +        cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
> 
> cast not needed, I think.

Ah, probably true, I'll remove it.

> > +                               dir == DMA_DIRECTION_FROM_DEVICE);
> > +        return 0;
> > +    } else {
> > +        return __dma_memory_rw(dma, addr, buf, len, dir);
> > +    }
> >  }
> >  
> >  static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
> > @@ -70,35 +124,55 @@ static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr,
> >                           DMA_DIRECTION_FROM_DEVICE);
> >  }
> >  
> > +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
> >  static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
> >                                    dma_addr_t len)
> >  {
> > -    /* Stub version when we have no iommu support */
> > -    cpu_physical_memory_zero(addr, len);
> > -    return 0;
> > +    if (!dma_has_iommu(dma)) {
> > +        /* Fast-path for no IOMMU */
> > +        cpu_physical_memory_zero(addr, len);
> > +        return 0;
> > +    } else {
> > +        return __dma_memory_zero(dma, addr, len);
> > +    }
> >  }
> >  
> > +void *__dma_memory_map(DMAContext *dma,
> > +                       DMAInvalidateMapFunc *cb, void *opaque,
> > +                       dma_addr_t addr, dma_addr_t *len,
> > +                       DMADirection dir);
> >  static inline void *dma_memory_map(DMAContext *dma,
> > -                                   DMAInvalidateMapFunc *cb, void *opaque,
> > +                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
> >                                     dma_addr_t addr, dma_addr_t *len,
> >                                     DMADirection dir)
> >  {
> > -    target_phys_addr_t xlen = *len;
> > -    void *p;
> > -
> > -    p = cpu_physical_memory_map(addr, &xlen,
> > -                                dir == DMA_DIRECTION_FROM_DEVICE);
> > -    *len = xlen;
> > -    return p;
> > +    if (!dma_has_iommu(dma)) {
> > +        target_phys_addr_t xlen = *len;
> > +        void *p;
> > +
> > +        p = cpu_physical_memory_map(addr, &xlen,
> > +                                    dir == DMA_DIRECTION_FROM_DEVICE);
> > +        *len = xlen;
> > +        return p;
> > +    } else {
> > +        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
> > +    }
> >  }
> 
> Are callbacks ever useful? I note that without an iommu, they are never
> invoked. So how can a device use them?

So, the only real user of callbacks in this series is the dma_bdrv
code, where the callback cancels the remainder of the AIO in
progress.  The semantics of the callback are supposed to be that once
the callback is complete, the device code will make no further
accesses to the mapped memory.

But this is a somewhat curly issue, and I've certainly thought about
other approaches to this.  Several ways of handling
invalidate-while-mapped have occurred to me:

1) Ignore the problem (no callback, or optional callback). If the
guest doesn't quiesce the device to cancel DMAs before removing IOMMU
mappings, well, too bad, it shoots itself in the foot, and in-progress
DMAs may still clobber no-longer-mapped guest memory.

The problem with that approach is that one use of an IOMMU is to
forcibly quiesce a device (including one in a broken state) by
prohibiting all its DMAs, then dealing with resetting the device later
on.  This approach will not faithfully model that use case.

2) The current approach, have a callback with the strong semantic
requirement that no further accesses occur after the callback is
completed.  The difficulty with that is that synchronization between
the callback and the code using the mapping could be hairy, especially
if qemu becomes multi-threaded in future.

3) "Lock" the iommu.  When maps are in use, mark the IOMMU such that
any invalidates must be delayed until the unmap happens.

4) Instead of dma_memory_map() providing direct pointers into guest
memory, use mremap() to obtain the new mapping.  On invalidate,
replace the mapping with a dummy chunk of memory.  That way the device
code doesn't SEGV and doesn't need synchronization with the
invalidate, but won't be able to clobber guest memory after the
invalidate.  This might entirely cancel the performance reasons for
using dma_memory_map() in the first place, though.

Oh, and I suppose for completeness there's:

5) Remove dma_memory_map() entirely, convert all devices to use
dma_memory_rw() instead.

So suggestions here welcome.

> > +void __dma_memory_unmap(DMAContext *dma,
> > +                        void *buffer, dma_addr_t len,
> > +                        DMADirection dir, dma_addr_t access_len);
> >  static inline void dma_memory_unmap(DMAContext *dma,
> >                                      void *buffer, dma_addr_t len,
> >                                      DMADirection dir, dma_addr_t access_len)
> >  {
> > -    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> > -                                     dir == DMA_DIRECTION_FROM_DEVICE,
> > -                                     access_len);
> > +    if (!dma_has_iommu(dma)) {
> > +        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> > +                                         dir == DMA_DIRECTION_FROM_DEVICE,
> > +                                         access_len);
> > +    } else {
> > +        __dma_memory_unmap(dma, buffer, len, dir, access_len);
> > +    }
> >  }
> >  
> >  #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
> > @@ -139,6 +213,14 @@ DEFINE_LDST_DMA(q, q, 64, be);
> >  
> >  #undef DEFINE_LDST_DMA
> >  
> > +#ifdef CONFIG_IOMMU
> > +
> > +void dma_context_init(DMAContext *dma, DMATranslateFunc, void *opaque);
> > +void dma_invalidate_memory_range(DMAContext *dma,
> > +                                 dma_addr_t addr, dma_addr_t len);
> > +
> > +#endif /* CONFIG_IOMMU */
> > +
> >  struct ScatterGatherEntry {
> >      dma_addr_t base;
> >      dma_addr_t len;

Patch

diff --git a/configure b/configure
index fb0e18e..3eff89c 100755
--- a/configure
+++ b/configure
@@ -138,6 +138,7 @@  linux_aio=""
 cap_ng=""
 attr=""
 libattr=""
+iommu="yes"
 xfs=""
 
 vhost_net="no"
@@ -784,6 +785,10 @@  for opt do
   ;;
   --enable-vhost-net) vhost_net="yes"
   ;;
+  --enable-iommu) iommu="yes"
+  ;;
+  --disable-iommu) iommu="no"
+  ;;
   --disable-opengl) opengl="no"
   ;;
   --enable-opengl) opengl="yes"
@@ -1085,6 +1090,8 @@  echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
+echo "  --disable-iommu          disable IOMMU emulation support"
+echo "  --enable-iommu           enable IOMMU emulation support"
 echo "  --enable-trace-backend=B Set trace backend"
 echo "                           Available backends:" $("$source_path"/scripts/tracetool --list-backends)
 echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
@@ -2935,6 +2942,7 @@  echo "posix_madvise     $posix_madvise"
 echo "uuid support      $uuid"
 echo "libcap-ng support $cap_ng"
 echo "vhost-net support $vhost_net"
+echo "IOMMU support     $iommu"
 echo "Trace backend     $trace_backend"
 echo "Trace output file $trace_file-<pid>"
 echo "spice support     $spice"
@@ -3812,6 +3820,10 @@  if test "$target_softmmu" = "yes" -a \( \
   echo "CONFIG_NEED_MMU=y" >> $config_target_mak
 fi
 
+if test "$iommu" = "yes" ; then
+  echo "CONFIG_IOMMU=y" >> $config_host_mak
+fi
+
 if test "$gprof" = "yes" ; then
   echo "TARGET_GPROF=yes" >> $config_target_mak
   if test "$target_linux_user" = "yes" ; then
diff --git a/dma-helpers.c b/dma-helpers.c
index 9dcfb2c..8269d60 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -10,6 +10,9 @@ 
 #include "dma.h"
 #include "block_int.h"
 #include "trace.h"
+#include "range.h"
+
+/*#define DEBUG_IOMMU*/
 
 void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
 {
@@ -255,3 +258,189 @@  void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
 {
     bdrv_acct_start(bs, cookie, sg->size, type);
 }
+
+#ifdef CONFIG_IOMMU
+bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
+                        DMADirection dir)
+{
+    target_phys_addr_t paddr, plen;
+
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_memory_check iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
+            dma, (unsigned long long)addr, (unsigned long long)len, dir);
+#endif
+
+    while (len) {
+        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
+            return false;
+        }
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+
+        len -= plen;
+        addr += plen;
+    }
+
+    return true;
+}
+
+int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
+                    void *buf, dma_addr_t len, DMADirection dir)
+{
+    target_phys_addr_t paddr, plen;
+    int err;
+
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_memory_rw iommu=%p addr=0x%llx len=0x%llx dir=%d\n",
+            dma, (unsigned long long)addr, (unsigned long long)len, dir);
+#endif
+
+    while (len) {
+        err = dma->translate(dma, addr, &paddr, &plen, dir);
+        if (err) {
+            return -1;
+        }
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+
+        cpu_physical_memory_rw(paddr, buf, plen,
+                               dir == DMA_DIRECTION_FROM_DEVICE);
+
+        len -= plen;
+        addr += plen;
+        buf += plen;
+    }
+
+    return 0;
+}
+
+int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
+{
+    target_phys_addr_t paddr, plen;
+    int err;
+
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_memory_zero iommu=%p addr=0x%llx len=0x%llx\n",
+            dma, (unsigned long long)addr, (unsigned long long)len);
+#endif
+
+    while (len) {
+        err = dma->translate(dma, addr, &paddr, &plen,
+                             DMA_DIRECTION_FROM_DEVICE);
+        if (err) {
+            return -1;
+        }
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+
+        cpu_physical_memory_zero(paddr, plen);
+
+        len -= plen;
+        addr += plen;
+    }
+
+    return 0;
+}
+
+typedef struct DMAMemoryMap DMAMemoryMap;
+struct DMAMemoryMap {
+    dma_addr_t              addr;
+    size_t                  len;
+    void                    *buf;
+    DMAInvalidateMapFunc    *invalidate;
+    void                    *invalidate_opaque;
+
+    QLIST_ENTRY(DMAMemoryMap) list;
+};
+
+void dma_context_init(DMAContext *dma, DMATranslateFunc fn, void *opaque)
+{
+    dma->translate = fn;
+    dma->opaque = opaque;
+    QLIST_INIT(&dma->memory_maps);
+}
+
+void dma_invalidate_memory_range(DMAContext *dma,
+                                 dma_addr_t addr, dma_addr_t len)
+{
+    DMAMemoryMap *map, *next;
+
+    QLIST_FOREACH_SAFE(map, &dma->memory_maps, list, next) {
+        if (ranges_overlap(addr, len, map->addr, map->len)) {
+            map->invalidate(map->invalidate_opaque);
+            QLIST_REMOVE(map, list);
+            g_free(map);
+        }
+    }
+}
+
+void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
+                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
+                       DMADirection dir)
+{
+    int err;
+    target_phys_addr_t paddr, plen;
+    void *buf;
+
+    plen = *len;
+    err = dma->translate(dma, addr, &paddr, &plen, dir);
+    if (err) {
+        return NULL;
+    }
+
+    /*
+     * If this is true, the virtual region is contiguous,
+     * but the translated physical region isn't. We just
+     * clamp *len, much like cpu_physical_memory_map() does.
+     */
+    if (plen < *len) {
+        *len = plen;
+    }
+
+    buf = cpu_physical_memory_map(paddr, &plen,
+                                  dir == DMA_DIRECTION_FROM_DEVICE);
+    *len = plen;
+
+    /* We treat maps as remote TLBs to cope with stuff like AIO. */
+    if (cb) {
+        DMAMemoryMap *map;
+
+        map = g_malloc(sizeof(DMAMemoryMap));
+        map->addr = addr;
+        map->len = *len;
+        map->buf = buf;
+        map->invalidate = cb;
+        map->invalidate_opaque = cb_opaque;
+
+        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
+    }
+
+    return buf;
+}
+
+void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
+                        DMADirection dir, dma_addr_t access_len)
+{
+    DMAMemoryMap *map;
+
+    cpu_physical_memory_unmap(buffer, len,
+                              dir == DMA_DIRECTION_FROM_DEVICE,
+                              access_len);
+
+    QLIST_FOREACH(map, &dma->memory_maps, list) {
+        if ((map->buf == buffer) && (map->len == len)) {
+            QLIST_REMOVE(map, list);
+            g_free(map);
+            break;
+        }
+    }
+}
+#endif /* CONFIG_IOMMU */
diff --git a/dma.h b/dma.h
index a66e3d7..c27f8b3 100644
--- a/dma.h
+++ b/dma.h
@@ -15,6 +15,7 @@ 
 #include "hw/hw.h"
 #include "block.h"
 
+typedef struct DMAContext DMAContext;
 typedef struct ScatterGatherEntry ScatterGatherEntry;
 
 typedef enum {
@@ -31,30 +32,83 @@  struct QEMUSGList {
 };
 
 #if defined(TARGET_PHYS_ADDR_BITS)
+
+#ifdef CONFIG_IOMMU
+/*
+ * When an IOMMU is present, bus addresses become distinct from
+ * CPU/memory physical addresses and may be a different size.  Because
+ * the IOVA size depends more on the bus than on the platform, we more
+ * or less have to treat these as 64-bit always to cover all (or at
+ * least most) cases.
+ */
+typedef uint64_t dma_addr_t;
+
+#define DMA_ADDR_BITS 64
+#define DMA_ADDR_FMT "%" PRIx64
+#else
 typedef target_phys_addr_t dma_addr_t;
 
 #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
 #define DMA_ADDR_FMT TARGET_FMT_plx
+#endif
+
+typedef int DMATranslateFunc(DMAContext *dma,
+                             dma_addr_t addr,
+                             target_phys_addr_t *paddr,
+                             target_phys_addr_t *len,
+                             DMADirection dir);
+
+typedef struct DMAContext {
+#ifdef CONFIG_IOMMU
+    DMATranslateFunc *translate;
+    void *opaque;
+    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
+#endif
+} DMAContext;
+
+#ifdef CONFIG_IOMMU
+static inline bool dma_has_iommu(DMAContext *dma)
+{
+    return (dma != NULL);
+}
+#else
+static inline bool dma_has_iommu(DMAContext *dma)
+{
+    return false;
+}
+#endif
 
 typedef void DMAInvalidateMapFunc(void *);
 
 /* Checks that the given range of addresses is valid for DMA.  This is
  * useful for certain cases, but usually you should just use
  * dma_memory_{read,write}() and check for errors */
-static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
-                                    dma_addr_t len, DMADirection dir)
+bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
+                        DMADirection dir);
+static inline bool dma_memory_valid(DMAContext *dma,
+                                    dma_addr_t addr, dma_addr_t len,
+                                    DMADirection dir)
 {
-    /* Stub version, with no iommu we assume all bus addresses are valid */
-    return true;
+    if (!dma_has_iommu(dma)) {
+        return true;
+    } else {
+        return __dma_memory_valid(dma, addr, len, dir);
+    }
 }
 
+int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
+                    void *buf, dma_addr_t len, DMADirection dir);
 static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
                                 void *buf, dma_addr_t len, DMADirection dir)
 {
-    /* Stub version when we have no iommu support */
-    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
-                           dir == DMA_DIRECTION_FROM_DEVICE);
-    return 0;
+    if (!dma_has_iommu(dma)) {
+        /* Fast-path for no IOMMU */
+        cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
+                               dir == DMA_DIRECTION_FROM_DEVICE);
+        return 0;
+    } else {
+        return __dma_memory_rw(dma, addr, buf, len, dir);
+    }
 }
 
 static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
@@ -70,35 +124,55 @@  static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr,
                          DMA_DIRECTION_FROM_DEVICE);
 }
 
+int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
 static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
                                   dma_addr_t len)
 {
-    /* Stub version when we have no iommu support */
-    cpu_physical_memory_zero(addr, len);
-    return 0;
+    if (!dma_has_iommu(dma)) {
+        /* Fast-path for no IOMMU */
+        cpu_physical_memory_zero(addr, len);
+        return 0;
+    } else {
+        return __dma_memory_zero(dma, addr, len);
+    }
 }
 
+void *__dma_memory_map(DMAContext *dma,
+                       DMAInvalidateMapFunc *cb, void *opaque,
+                       dma_addr_t addr, dma_addr_t *len,
+                       DMADirection dir);
 static inline void *dma_memory_map(DMAContext *dma,
-                                   DMAInvalidateMapFunc *cb, void *opaque,
+                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
                                    dma_addr_t addr, dma_addr_t *len,
                                    DMADirection dir)
 {
-    target_phys_addr_t xlen = *len;
-    void *p;
-
-    p = cpu_physical_memory_map(addr, &xlen,
-                                dir == DMA_DIRECTION_FROM_DEVICE);
-    *len = xlen;
-    return p;
+    if (!dma_has_iommu(dma)) {
+        target_phys_addr_t xlen = *len;
+        void *p;
+
+        p = cpu_physical_memory_map(addr, &xlen,
+                                    dir == DMA_DIRECTION_FROM_DEVICE);
+        *len = xlen;
+        return p;
+    } else {
+        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
+    }
 }
 
+void __dma_memory_unmap(DMAContext *dma,
+                        void *buffer, dma_addr_t len,
+                        DMADirection dir, dma_addr_t access_len);
 static inline void dma_memory_unmap(DMAContext *dma,
                                     void *buffer, dma_addr_t len,
                                     DMADirection dir, dma_addr_t access_len)
 {
-    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
-                                     dir == DMA_DIRECTION_FROM_DEVICE,
-                                     access_len);
+    if (!dma_has_iommu(dma)) {
+        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
+                                         dir == DMA_DIRECTION_FROM_DEVICE,
+                                         access_len);
+    } else {
+        __dma_memory_unmap(dma, buffer, len, dir, access_len);
+    }
 }
 
 #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
@@ -139,6 +213,14 @@  DEFINE_LDST_DMA(q, q, 64, be);
 
 #undef DEFINE_LDST_DMA
 
+#ifdef CONFIG_IOMMU
+
+void dma_context_init(DMAContext *dma, DMATranslateFunc, void *opaque);
+void dma_invalidate_memory_range(DMAContext *dma,
+                                 dma_addr_t addr, dma_addr_t len);
+
+#endif /* CONFIG_IOMMU */
+
 struct ScatterGatherEntry {
     dma_addr_t base;
     dma_addr_t len;