Patchwork [10/13] iommu: Introduce IOMMU emulation infrastructure

login
register
mail settings
Submitter David Gibson
Date March 9, 2012, 5:01 a.m.
Message ID <1331269308-22372-11-git-send-email-david@gibson.dropbear.id.au>
Download mbox | patch
Permalink /patch/145659/
State New
Headers show

Comments

David Gibson - March 9, 2012, 5:01 a.m.
This patch adds the basic infrastructure necessary to emulate an IOMMU
visible to the guest.  The DMAContext structure is extended with
information and a callback describing the translation, and the various
DMA functions used by devices will now perform IOMMU translation using
this callback.

Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Richard Henderson <rth@twiddle.net>

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 configure     |   12 ++++
 dma-helpers.c |  191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 dma.h         |  125 +++++++++++++++++++++++++++++++-------
 3 files changed, 306 insertions(+), 22 deletions(-)
Paolo Bonzini - March 9, 2012, 10:23 a.m.
Il 09/03/2012 06:01, David Gibson ha scritto:
> This patch adds the basic infrastructure necessary to emulate an IOMMU
> visible to the guest.  The DMAContext structure is extended with
> information and a callback describing the translation, and the various
> DMA functions used by devices will now perform IOMMU translation using
> this callback.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Richard Henderson <rth@twiddle.net>
> 
> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  configure     |   12 ++++
>  dma-helpers.c |  191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  dma.h         |  125 +++++++++++++++++++++++++++++++-------
>  3 files changed, 306 insertions(+), 22 deletions(-)
> 
> diff --git a/configure b/configure
> index a5eb832..e6fba2f 100755
> --- a/configure
> +++ b/configure
> @@ -138,6 +138,7 @@ linux_aio=""
>  cap_ng=""
>  attr=""
>  libattr=""
> +iommu="yes"
>  xfs=""
>  
>  vhost_net="no"
> @@ -784,6 +785,10 @@ for opt do
>    ;;
>    --enable-vhost-net) vhost_net="yes"
>    ;;
> +  --enable-iommu) iommu="yes"
> +  ;;
> +  --disable-iommu) iommu="no"
> +  ;;

This need not be a configure option.  Just enable it, or it will bitrot.

>    --disable-opengl) opengl="no"
>    ;;
>    --enable-opengl) opengl="yes"
> @@ -1085,6 +1090,8 @@ echo "  --enable-docs            enable documentation build"
>  echo "  --disable-docs           disable documentation build"
>  echo "  --disable-vhost-net      disable vhost-net acceleration support"
>  echo "  --enable-vhost-net       enable vhost-net acceleration support"
> +echo "  --disable-iommu          disable IOMMU emulation support"
> +echo "  --enable-iommu           enable IOMMU emulation support"
>  echo "  --enable-trace-backend=B Set trace backend"
>  echo "                           Available backends:" $("$source_path"/scripts/tracetool --list-backends)
>  echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
> @@ -2924,6 +2931,7 @@ echo "posix_madvise     $posix_madvise"
>  echo "uuid support      $uuid"
>  echo "libcap-ng support $cap_ng"
>  echo "vhost-net support $vhost_net"
> +echo "IOMMU support     $iommu"
>  echo "Trace backend     $trace_backend"
>  echo "Trace output file $trace_file-<pid>"
>  echo "spice support     $spice"
> @@ -3801,6 +3809,10 @@ if test "$target_softmmu" = "yes" -a \( \
>    echo "CONFIG_NEED_MMU=y" >> $config_target_mak
>  fi
>  
> +if test "$iommu" = "yes" ; then
> +  echo "CONFIG_IOMMU=y" >> $config_host_mak
> +fi
> +
>  if test "$gprof" = "yes" ; then
>    echo "TARGET_GPROF=yes" >> $config_target_mak
>    if test "$target_linux_user" = "yes" ; then
> diff --git a/dma-helpers.c b/dma-helpers.c
> index 9dcfb2c..248575d 100644
> --- a/dma-helpers.c
> +++ b/dma-helpers.c
> @@ -10,6 +10,9 @@
>  #include "dma.h"
>  #include "block_int.h"
>  #include "trace.h"
> +#include "range.h"
> +
> +/* #define DEBUG_IOMMU */
>  
>  void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
>  {
> @@ -255,3 +258,191 @@ void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
>  {
>      bdrv_acct_start(bs, cookie, sg->size, type);
>  }
> +
> +#ifdef CONFIG_IOMMU
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir)

No __ identifiers, please.  You could call these
dma_context_memory_{valid,read,write}.

> +{
> +    target_phys_addr_t paddr, plen;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_check context=%p addr=0x" DMA_ADDR_FMT
> +            " len=0x" DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir);
> +#endif
> +
> +    while (len) {
> +        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
> +            return false;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return true;
> +}
> +
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir)
> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_rw context=%p addr=0x" DMA_ADDR_FMT " len=0x"
> +            DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen, dir);
> +        if (err) {
> +            return -1;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_rw(paddr, buf, plen,
> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +
> +        len -= plen;
> +        addr += plen;
> +        buf += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
> +{
> +    target_phys_addr_t paddr, plen;
> +    int err;
> +
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_memory_zero context=%p addr=0x" DMA_ADDR_FMT
> +            " len=0x" DMA_ADDR_FMT "\n", dma, addr, len);
> +#endif
> +
> +    while (len) {
> +        err = dma->translate(dma, addr, &paddr, &plen,
> +                             DMA_DIRECTION_FROM_DEVICE);
> +        if (err) {
> +            return err;
> +        }
> +
> +        /* The translation might be valid for larger regions. */
> +        if (plen > len) {
> +            plen = len;
> +        }
> +
> +        cpu_physical_memory_zero(paddr, plen);
> +
> +        len -= plen;
> +        addr += plen;
> +    }
> +
> +    return 0;
> +}
> +
> +typedef struct DMAMemoryMap DMAMemoryMap;
> +struct DMAMemoryMap {
> +    dma_addr_t              addr;
> +    size_t                  len;
> +    void                    *buf;
> +    DMAInvalidateMapFunc    *invalidate;
> +    void                    *invalidate_opaque;
> +
> +    QLIST_ENTRY(DMAMemoryMap) list;
> +};
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc fn)
> +{
> +#ifdef DEBUG_IOMMU
> +    fprintf(stderr, "dma_context_init(%p, %p)\n", dma, fn);
> +#endif
> +    dma->translate = fn;
> +    QLIST_INIT(&dma->memory_maps);
> +}
> +
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len)
> +{
> +    DMAMemoryMap *map;
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if (ranges_overlap(addr, len, map->addr, map->len)) {
> +            map->invalidate(map->invalidate_opaque);
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +
> +void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
> +                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir)
> +{
> +    int err;
> +    target_phys_addr_t paddr, plen;
> +    void *buf;
> +
> +    plen = *len;
> +    err = dma->translate(dma, addr, &paddr, &plen, dir);
> +    if (err) {
> +        return NULL;
> +    }
> +
> +    /*
> +     * If this is true, the virtual region is contiguous,
> +     * but the translated physical region isn't. We just
> +     * clamp *len, much like cpu_physical_memory_map() does.
> +     */
> +    if (plen < *len) {
> +        *len = plen;
> +    }
> +
> +    buf = cpu_physical_memory_map(paddr, &plen,
> +                                  dir == DMA_DIRECTION_FROM_DEVICE);
> +    *len = plen;
> +
> +    /* We treat maps as remote TLBs to cope with stuff like AIO. */
> +    if (cb) {
> +        DMAMemoryMap *map;
> +
> +        map = g_malloc(sizeof(DMAMemoryMap));
> +        map->addr = addr;
> +        map->len = *len;
> +        map->buf = buf;
> +        map->invalidate = cb;
> +        map->invalidate_opaque = cb_opaque;
> +
> +        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
> +    }
> +
> +    return buf;
> +}
> +
> +void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len)
> +{
> +    DMAMemoryMap *map;
> +
> +    cpu_physical_memory_unmap(buffer, len,
> +                              dir == DMA_DIRECTION_FROM_DEVICE,
> +                              access_len);
> +
> +    QLIST_FOREACH(map, &dma->memory_maps, list) {
> +        if ((map->buf == buffer) && (map->len == len)) {
> +            QLIST_REMOVE(map, list);
> +            free(map);
> +        }
> +    }
> +}
> +#endif /* CONFIG_IOMMU */
> diff --git a/dma.h b/dma.h
> index a66e3d7..ec06163 100644
> --- a/dma.h
> +++ b/dma.h
> @@ -15,6 +15,7 @@
>  #include "hw/hw.h"
>  #include "block.h"
>  
> +typedef struct DMAContext DMAContext;
>  typedef struct ScatterGatherEntry ScatterGatherEntry;
>  
>  typedef enum {
> @@ -31,30 +32,82 @@ struct QEMUSGList {
>  };
>  
>  #if defined(TARGET_PHYS_ADDR_BITS)
> +
> +#ifdef CONFIG_IOMMU

... which i making dma_addr_t always 64-bit.  This way you can even
remove the new qdev-dma.c file.  Just make DEFINE_PROP_DMAADDR an alias
for DEFINE_PROP_HEX64.

> +/*
> + * When an IOMMU is present, bus addresses become distinct from
> + * CPU/memory physical addresses and may be a different size.  Because
> + * the IOVA size depends more on the bus than on the platform, we more
> + * or less have to treat these as 64-bit always to cover all (or at
> + * least most) cases.
> + */
> +typedef uint64_t dma_addr_t;
> +
> +#define DMA_ADDR_BITS 64
> +#define DMA_ADDR_FMT "%" PRIx64
> +#else
>  typedef target_phys_addr_t dma_addr_t;
>  
>  #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
>  #define DMA_ADDR_FMT TARGET_FMT_plx
> +#endif
> +
> +typedef int DMATranslateFunc(DMAContext *dma,
> +                             dma_addr_t addr,
> +                             target_phys_addr_t *paddr,
> +                             target_phys_addr_t *len,
> +                             DMADirection dir);
> +
> +typedef struct DMAContext {
> +#ifdef CONFIG_IOMMU
> +    DMATranslateFunc *translate;
> +    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
> +#endif
> +} DMAContext;
> +
> +#ifdef CONFIG_IOMMU
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return !!dma;
> +}
> +#else
> +static inline bool dma_has_iommu(DMAContext *dma)
> +{
> +    return false;
> +}
> +#endif
>  
>  typedef void DMAInvalidateMapFunc(void *);
>  
>  /* Checks that the given range of addresses is valid for DMA.  This is
>   * useful for certain cases, but usually you should just use
>   * dma_memory_{read,write}() and check for errors */
> -static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
> -                                    dma_addr_t len, DMADirection dir)
> +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> +                        DMADirection dir);
> +static inline bool dma_memory_valid(DMAContext *dma,
> +                                    dma_addr_t addr, dma_addr_t len,
> +                                    DMADirection dir)
>  {
> -    /* Stub version, with no iommu we assume all bus addresses are valid */
> -    return true;
> +    if (!dma_has_iommu(dma)) {
> +        return true;
> +    } else {
> +        return __dma_memory_valid(dma, addr, len, dir);
> +    }
>  }
>  
> +int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
> +                    void *buf, dma_addr_t len, DMADirection dir);
>  static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
>                                  void *buf, dma_addr_t len, DMADirection dir)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
> -                           dir == DMA_DIRECTION_FROM_DEVICE);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_rw(addr, buf, len,
> +                               dir == DMA_DIRECTION_FROM_DEVICE);
> +        return 0;
> +    } else {
> +        return __dma_memory_rw(dma, addr, buf, len, dir);
> +    }
>  }
>  
>  static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
> @@ -70,35 +123,55 @@ static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr,
>                           DMA_DIRECTION_FROM_DEVICE);
>  }
>  
> +int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
>  static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
>                                    dma_addr_t len)
>  {
> -    /* Stub version when we have no iommu support */
> -    cpu_physical_memory_zero(addr, len);
> -    return 0;
> +    if (!dma_has_iommu(dma)) {
> +        /* Fast-path for no IOMMU */
> +        cpu_physical_memory_zero(addr, len);
> +        return 0;
> +    } else {
> +        return __dma_memory_zero(dma, addr, len);
> +    }
>  }
>  
> +void *__dma_memory_map(DMAContext *dma,
> +                       DMAInvalidateMapFunc *cb, void *opaque,
> +                       dma_addr_t addr, dma_addr_t *len,
> +                       DMADirection dir);
>  static inline void *dma_memory_map(DMAContext *dma,
> -                                   DMAInvalidateMapFunc *cb, void *opaque,
> +                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
>                                     dma_addr_t addr, dma_addr_t *len,
>                                     DMADirection dir)
>  {
> -    target_phys_addr_t xlen = *len;
> -    void *p;
> -
> -    p = cpu_physical_memory_map(addr, &xlen,
> -                                dir == DMA_DIRECTION_FROM_DEVICE);
> -    *len = xlen;
> -    return p;
> +    if (!dma_has_iommu(dma)) {
> +        target_phys_addr_t xlen = *len;
> +        void *p;
> +
> +        p = cpu_physical_memory_map(addr, &xlen,
> +                                    dir == DMA_DIRECTION_FROM_DEVICE);
> +        *len = xlen;
> +        return p;
> +    } else {
> +        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
> +    }
>  }
>  
> +void __dma_memory_unmap(DMAContext *dma,
> +                        void *buffer, dma_addr_t len,
> +                        DMADirection dir, dma_addr_t access_len);
>  static inline void dma_memory_unmap(DMAContext *dma,
>                                      void *buffer, dma_addr_t len,
>                                      DMADirection dir, dma_addr_t access_len)
>  {
> -    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> -                                     dir == DMA_DIRECTION_FROM_DEVICE,
> -                                     access_len);
> +    if (!dma_has_iommu(dma)) {
> +        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
> +                                         dir == DMA_DIRECTION_FROM_DEVICE,
> +                                         access_len);
> +    } else {
> +        __dma_memory_unmap(dma, buffer, len, dir, access_len);
> +    }
>  }
>  
>  #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
> @@ -139,6 +212,14 @@ DEFINE_LDST_DMA(q, q, 64, be);
>  
>  #undef DEFINE_LDST_DMA
>  
> +#ifdef CONFIG_IOMMU
> +
> +void dma_context_init(DMAContext *dma, DMATranslateFunc fn);
> +void dma_invalidate_memory_range(DMAContext *dma,
> +                                 dma_addr_t addr, dma_addr_t len);

Should dma_invalidate_memory_range be changed to a no-op if dma == NULL?
 No idea, just asking.

Paolo

> +
> +#endif /* CONFIG_IOMMU */
> +
>  struct ScatterGatherEntry {
>      dma_addr_t base;
>      dma_addr_t len;
David Gibson - March 13, 2012, 5:07 a.m.
On Fri, Mar 09, 2012 at 11:23:01AM +0100, Paolo Bonzini wrote:
> Il 09/03/2012 06:01, David Gibson ha scritto:
> > This patch adds the basic infrastructure necessary to emulate an IOMMU
> > visible to the guest.  The DMAContext structure is extended with
> > information and a callback describing the translation, and the various
> > DMA functions used by devices will now perform IOMMU translation using
> > this callback.
> > 
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Richard Henderson <rth@twiddle.net>
> > 
> > Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  configure     |   12 ++++
> >  dma-helpers.c |  191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  dma.h         |  125 +++++++++++++++++++++++++++++++-------
> >  3 files changed, 306 insertions(+), 22 deletions(-)
> > 
> > diff --git a/configure b/configure
> > index a5eb832..e6fba2f 100755
> > --- a/configure
> > +++ b/configure
> > @@ -138,6 +138,7 @@ linux_aio=""
> >  cap_ng=""
> >  attr=""
> >  libattr=""
> > +iommu="yes"
> >  xfs=""
> >  
> >  vhost_net="no"
> > @@ -784,6 +785,10 @@ for opt do
> >    ;;
> >    --enable-vhost-net) vhost_net="yes"
> >    ;;
> > +  --enable-iommu) iommu="yes"
> > +  ;;
> > +  --disable-iommu) iommu="no"
> > +  ;;
> 
> This need not be a configure option.  Just enable it, or it will
> bitrot.

I did wonder about that.  I'd like to hear that suggested by more than
one person before I unconditionally add code which will impose an
overhead on all emulated DMAs.

[snip]
> > +
> > +#ifdef CONFIG_IOMMU
> > +bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
> > +                        DMADirection dir)
> 
> No __ identifiers, please.  You could call these
> dma_context_memory_{valid,read,write}.

Ok, I'll think of different names.

[snip]
> > diff --git a/dma.h b/dma.h
> > index a66e3d7..ec06163 100644
> > --- a/dma.h
> > +++ b/dma.h
> > @@ -15,6 +15,7 @@
> >  #include "hw/hw.h"
> >  #include "block.h"
> >  
> > +typedef struct DMAContext DMAContext;
> >  typedef struct ScatterGatherEntry ScatterGatherEntry;
> >  
> >  typedef enum {
> > @@ -31,30 +32,82 @@ struct QEMUSGList {
> >  };
> >  
> >  #if defined(TARGET_PHYS_ADDR_BITS)
> > +
> > +#ifdef CONFIG_IOMMU
> 
> ... which i making dma_addr_t always 64-bit.  This way you can even
> remove the new qdev-dma.c file.  Just make DEFINE_PROP_DMAADDR an alias
> for DEFINE_PROP_HEX64.

Ah, true, I could #define it to DEFINE_PROP_TADDR before this change,
too.  Ok will, revise accordingly.

[snip]
> > +#ifdef CONFIG_IOMMU
> > +
> > +void dma_context_init(DMAContext *dma, DMATranslateFunc fn);
> > +void dma_invalidate_memory_range(DMAContext *dma,
> > +                                 dma_addr_t addr, dma_addr_t len);
> 
> Should dma_invalidate_memory_range be changed to a no-op if dma == NULL?
>  No idea, just asking.

Not really.  Since dma_invalidate_memory_range() would be called from
the IOMMU implementation when translations are removed, it should
never be called with dma == NULL (if there's IOMMU code to do the
calling, then there must be a non-NULL DMAContext to describe that IOMMU).
Alexander Graf - March 13, 2012, 1:56 p.m.
On 13.03.2012, at 06:07, David Gibson wrote:

> On Fri, Mar 09, 2012 at 11:23:01AM +0100, Paolo Bonzini wrote:
>> Il 09/03/2012 06:01, David Gibson ha scritto:
>>> This patch adds the basic infrastructure necessary to emulate an IOMMU
>>> visible to the guest.  The DMAContext structure is extended with
>>> information and a callback describing the translation, and the various
>>> DMA functions used by devices will now perform IOMMU translation using
>>> this callback.
>>> 
>>> Cc: Michael S. Tsirkin <mst@redhat.com>
>>> Cc: Richard Henderson <rth@twiddle.net>
>>> 
>>> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>> ---
>>> configure     |   12 ++++
>>> dma-helpers.c |  191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>> dma.h         |  125 +++++++++++++++++++++++++++++++-------
>>> 3 files changed, 306 insertions(+), 22 deletions(-)
>>> 
>>> diff --git a/configure b/configure
>>> index a5eb832..e6fba2f 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -138,6 +138,7 @@ linux_aio=""
>>> cap_ng=""
>>> attr=""
>>> libattr=""
>>> +iommu="yes"
>>> xfs=""
>>> 
>>> vhost_net="no"
>>> @@ -784,6 +785,10 @@ for opt do
>>>   ;;
>>>   --enable-vhost-net) vhost_net="yes"
>>>   ;;
>>> +  --enable-iommu) iommu="yes"
>>> +  ;;
>>> +  --disable-iommu) iommu="no"
>>> +  ;;
>> 
>> This need not be a configure option.  Just enable it, or it will
>> bitrot.
> 
> I did wonder about that.  I'd like to hear that suggested by more than
> one person before I unconditionally add code which will impose an
> overhead on all emulated DMAs.

Maybe make it a target option? That way targets that want and need an IOMMU can compile the code in, while the others don't see performance hits. That would of course mean that we'd still have conditional code paths, but at least they wouldn't bitrot, as building multiple targets means you'd build both paths.


Alex
David Gibson - March 13, 2012, 2:04 p.m.
On Tue, Mar 13, 2012 at 02:56:47PM +0100, Alexander Graf wrote:
> 
> On 13.03.2012, at 06:07, David Gibson wrote:
> 
> > On Fri, Mar 09, 2012 at 11:23:01AM +0100, Paolo Bonzini wrote:
> >> Il 09/03/2012 06:01, David Gibson ha scritto:
> >>> This patch adds the basic infrastructure necessary to emulate an IOMMU
> >>> visible to the guest.  The DMAContext structure is extended with
> >>> information and a callback describing the translation, and the various
> >>> DMA functions used by devices will now perform IOMMU translation using
> >>> this callback.
> >>> 
> >>> Cc: Michael S. Tsirkin <mst@redhat.com>
> >>> Cc: Richard Henderson <rth@twiddle.net>
> >>> 
> >>> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
> >>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> >>> ---
> >>> configure     |   12 ++++
> >>> dma-helpers.c |  191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>> dma.h         |  125 +++++++++++++++++++++++++++++++-------
> >>> 3 files changed, 306 insertions(+), 22 deletions(-)
> >>> 
> >>> diff --git a/configure b/configure
> >>> index a5eb832..e6fba2f 100755
> >>> --- a/configure
> >>> +++ b/configure
> >>> @@ -138,6 +138,7 @@ linux_aio=""
> >>> cap_ng=""
> >>> attr=""
> >>> libattr=""
> >>> +iommu="yes"
> >>> xfs=""
> >>> 
> >>> vhost_net="no"
> >>> @@ -784,6 +785,10 @@ for opt do
> >>>   ;;
> >>>   --enable-vhost-net) vhost_net="yes"
> >>>   ;;
> >>> +  --enable-iommu) iommu="yes"
> >>> +  ;;
> >>> +  --disable-iommu) iommu="no"
> >>> +  ;;
> >> 
> >> This need not be a configure option.  Just enable it, or it will
> >> bitrot.
> > 
> > I did wonder about that.  I'd like to hear that suggested by more than
> > one person before I unconditionally add code which will impose an
> > overhead on all emulated DMAs.
> 
> Maybe make it a target option? That way targets that want and need
> an IOMMU can compile the code in, while the others don't see
> performance hits. That would of course mean that we'd still have
> conditional code paths, but at least they wouldn't bitrot, as
> building multiple targets means you'd build both paths.

Heh, yeah, so I tried that way back.  It's problematic, unfortunately,
as it would require pulling a number of things out of libhw and back
into target dependent code.
Alexander Graf - March 13, 2012, 2:37 p.m.
On 13.03.2012, at 15:04, David Gibson wrote:

> On Tue, Mar 13, 2012 at 02:56:47PM +0100, Alexander Graf wrote:
>> 
>> On 13.03.2012, at 06:07, David Gibson wrote:
>> 
>>> On Fri, Mar 09, 2012 at 11:23:01AM +0100, Paolo Bonzini wrote:
>>>> Il 09/03/2012 06:01, David Gibson ha scritto:
>>>>> This patch adds the basic infrastructure necessary to emulate an IOMMU
>>>>> visible to the guest.  The DMAContext structure is extended with
>>>>> information and a callback describing the translation, and the various
>>>>> DMA functions used by devices will now perform IOMMU translation using
>>>>> this callback.
>>>>> 
>>>>> Cc: Michael S. Tsirkin <mst@redhat.com>
>>>>> Cc: Richard Henderson <rth@twiddle.net>
>>>>> 
>>>>> Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
>>>>> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
>>>>> ---
>>>>> configure     |   12 ++++
>>>>> dma-helpers.c |  191 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>> dma.h         |  125 +++++++++++++++++++++++++++++++-------
>>>>> 3 files changed, 306 insertions(+), 22 deletions(-)
>>>>> 
>>>>> diff --git a/configure b/configure
>>>>> index a5eb832..e6fba2f 100755
>>>>> --- a/configure
>>>>> +++ b/configure
>>>>> @@ -138,6 +138,7 @@ linux_aio=""
>>>>> cap_ng=""
>>>>> attr=""
>>>>> libattr=""
>>>>> +iommu="yes"
>>>>> xfs=""
>>>>> 
>>>>> vhost_net="no"
>>>>> @@ -784,6 +785,10 @@ for opt do
>>>>>  ;;
>>>>>  --enable-vhost-net) vhost_net="yes"
>>>>>  ;;
>>>>> +  --enable-iommu) iommu="yes"
>>>>> +  ;;
>>>>> +  --disable-iommu) iommu="no"
>>>>> +  ;;
>>>> 
>>>> This need not be a configure option.  Just enable it, or it will
>>>> bitrot.
>>> 
>>> I did wonder about that.  I'd like to hear that suggested by more than
>>> one person before I unconditionally add code which will impose an
>>> overhead on all emulated DMAs.
>> 
>> Maybe make it a target option? That way targets that want and need
>> an IOMMU can compile the code in, while the others don't see
>> performance hits. That would of course mean that we'd still have
>> conditional code paths, but at least they wouldn't bitrot, as
>> building multiple targets means you'd build both paths.
> 
> Heh, yeah, so I tried that way back.  It's problematic, unfortunately,
> as it would require pulling a number of things out of libhw and back
> into target dependent code.

Hrm. How about measuring the performance overhead then? Maybe it's negligible?


Alex
David Gibson - March 14, 2012, 9:05 a.m.
On Tue, Mar 13, 2012 at 03:37:09PM +0100, Alexander Graf wrote:
> On 13.03.2012, at 15:04, David Gibson wrote:
> > On Tue, Mar 13, 2012 at 02:56:47PM +0100, Alexander Graf wrote:
> >> On 13.03.2012, at 06:07, David Gibson wrote:
[snip]
> >>>> This need not be a configure option.  Just enable it, or it will
> >>>> bitrot.
> >>> 
> >>> I did wonder about that.  I'd like to hear that suggested by more than
> >>> one person before I unconditionally add code which will impose an
> >>> overhead on all emulated DMAs.
> >> 
> >> Maybe make it a target option? That way targets that want and need
> >> an IOMMU can compile the code in, while the others don't see
> >> performance hits. That would of course mean that we'd still have
> >> conditional code paths, but at least they wouldn't bitrot, as
> >> building multiple targets means you'd build both paths.
> > 
> > Heh, yeah, so I tried that way back.  It's problematic, unfortunately,
> > as it would require pulling a number of things out of libhw and back
> > into target dependent code.
> 
> Hrm. How about measuring the performance overhead then? Maybe it's
> negligible?

Yeah, I should do that.  Finding a suitable testcase and running it on
a variety of targets is non-trivial, though.

Patch

diff --git a/configure b/configure
index a5eb832..e6fba2f 100755
--- a/configure
+++ b/configure
@@ -138,6 +138,7 @@  linux_aio=""
 cap_ng=""
 attr=""
 libattr=""
+iommu="yes"
 xfs=""
 
 vhost_net="no"
@@ -784,6 +785,10 @@  for opt do
   ;;
   --enable-vhost-net) vhost_net="yes"
   ;;
+  --enable-iommu) iommu="yes"
+  ;;
+  --disable-iommu) iommu="no"
+  ;;
   --disable-opengl) opengl="no"
   ;;
   --enable-opengl) opengl="yes"
@@ -1085,6 +1090,8 @@  echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
+echo "  --disable-iommu          disable IOMMU emulation support"
+echo "  --enable-iommu           enable IOMMU emulation support"
 echo "  --enable-trace-backend=B Set trace backend"
 echo "                           Available backends:" $("$source_path"/scripts/tracetool --list-backends)
 echo "  --with-trace-file=NAME   Full PATH,NAME of file to store traces"
@@ -2924,6 +2931,7 @@  echo "posix_madvise     $posix_madvise"
 echo "uuid support      $uuid"
 echo "libcap-ng support $cap_ng"
 echo "vhost-net support $vhost_net"
+echo "IOMMU support     $iommu"
 echo "Trace backend     $trace_backend"
 echo "Trace output file $trace_file-<pid>"
 echo "spice support     $spice"
@@ -3801,6 +3809,10 @@  if test "$target_softmmu" = "yes" -a \( \
   echo "CONFIG_NEED_MMU=y" >> $config_target_mak
 fi
 
+if test "$iommu" = "yes" ; then
+  echo "CONFIG_IOMMU=y" >> $config_host_mak
+fi
+
 if test "$gprof" = "yes" ; then
   echo "TARGET_GPROF=yes" >> $config_target_mak
   if test "$target_linux_user" = "yes" ; then
diff --git a/dma-helpers.c b/dma-helpers.c
index 9dcfb2c..248575d 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -10,6 +10,9 @@ 
 #include "dma.h"
 #include "block_int.h"
 #include "trace.h"
+#include "range.h"
+
+/* #define DEBUG_IOMMU */
 
 void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint, DMAContext *dma)
 {
@@ -255,3 +258,191 @@  void dma_acct_start(BlockDriverState *bs, BlockAcctCookie *cookie,
 {
     bdrv_acct_start(bs, cookie, sg->size, type);
 }
+
+#ifdef CONFIG_IOMMU
+bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
+                        DMADirection dir)
+{
+    target_phys_addr_t paddr, plen;
+
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_memory_check context=%p addr=0x" DMA_ADDR_FMT
+            " len=0x" DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir);
+#endif
+
+    while (len) {
+        if (dma->translate(dma, addr, &paddr, &plen, dir) != 0) {
+            return false;
+        }
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+
+        len -= plen;
+        addr += plen;
+    }
+
+    return true;
+}
+
+int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
+                    void *buf, dma_addr_t len, DMADirection dir)
+{
+    target_phys_addr_t paddr, plen;
+    int err;
+
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_memory_rw context=%p addr=0x" DMA_ADDR_FMT " len=0x"
+            DMA_ADDR_FMT " dir=%d\n", dma, addr, len, dir);
+#endif
+
+    while (len) {
+        err = dma->translate(dma, addr, &paddr, &plen, dir);
+        if (err) {
+            return -1;
+        }
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+
+        cpu_physical_memory_rw(paddr, buf, plen,
+                               dir == DMA_DIRECTION_FROM_DEVICE);
+
+        len -= plen;
+        addr += plen;
+        buf += plen;
+    }
+
+    return 0;
+}
+
+int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len)
+{
+    target_phys_addr_t paddr, plen;
+    int err;
+
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_memory_zero context=%p addr=0x" DMA_ADDR_FMT
+            " len=0x" DMA_ADDR_FMT "\n", dma, addr, len);
+#endif
+
+    while (len) {
+        err = dma->translate(dma, addr, &paddr, &plen,
+                             DMA_DIRECTION_FROM_DEVICE);
+        if (err) {
+            return err;
+        }
+
+        /* The translation might be valid for larger regions. */
+        if (plen > len) {
+            plen = len;
+        }
+
+        cpu_physical_memory_zero(paddr, plen);
+
+        len -= plen;
+        addr += plen;
+    }
+
+    return 0;
+}
+
+typedef struct DMAMemoryMap DMAMemoryMap;
+struct DMAMemoryMap {
+    dma_addr_t              addr;
+    size_t                  len;
+    void                    *buf;
+    DMAInvalidateMapFunc    *invalidate;
+    void                    *invalidate_opaque;
+
+    QLIST_ENTRY(DMAMemoryMap) list;
+};
+
+void dma_context_init(DMAContext *dma, DMATranslateFunc fn)
+{
+#ifdef DEBUG_IOMMU
+    fprintf(stderr, "dma_context_init(%p, %p)\n", dma, fn);
+#endif
+    dma->translate = fn;
+    QLIST_INIT(&dma->memory_maps);
+}
+
+void dma_invalidate_memory_range(DMAContext *dma,
+                                 dma_addr_t addr, dma_addr_t len)
+{
+    DMAMemoryMap *map;
+
+    QLIST_FOREACH(map, &dma->memory_maps, list) {
+        if (ranges_overlap(addr, len, map->addr, map->len)) {
+            map->invalidate(map->invalidate_opaque);
+            QLIST_REMOVE(map, list);
+            free(map);
+        }
+    }
+}
+
+void *__dma_memory_map(DMAContext *dma, DMAInvalidateMapFunc *cb,
+                       void *cb_opaque, dma_addr_t addr, dma_addr_t *len,
+                       DMADirection dir)
+{
+    int err;
+    target_phys_addr_t paddr, plen;
+    void *buf;
+
+    plen = *len;
+    err = dma->translate(dma, addr, &paddr, &plen, dir);
+    if (err) {
+        return NULL;
+    }
+
+    /*
+     * If this is true, the virtual region is contiguous,
+     * but the translated physical region isn't. We just
+     * clamp *len, much like cpu_physical_memory_map() does.
+     */
+    if (plen < *len) {
+        *len = plen;
+    }
+
+    buf = cpu_physical_memory_map(paddr, &plen,
+                                  dir == DMA_DIRECTION_FROM_DEVICE);
+    *len = plen;
+
+    /* We treat maps as remote TLBs to cope with stuff like AIO. */
+    if (cb) {
+        DMAMemoryMap *map;
+
+        map = g_malloc(sizeof(DMAMemoryMap));
+        map->addr = addr;
+        map->len = *len;
+        map->buf = buf;
+        map->invalidate = cb;
+        map->invalidate_opaque = cb_opaque;
+
+        QLIST_INSERT_HEAD(&dma->memory_maps, map, list);
+    }
+
+    return buf;
+}
+
+void __dma_memory_unmap(DMAContext *dma, void *buffer, dma_addr_t len,
+                        DMADirection dir, dma_addr_t access_len)
+{
+    DMAMemoryMap *map;
+
+    cpu_physical_memory_unmap(buffer, len,
+                              dir == DMA_DIRECTION_FROM_DEVICE,
+                              access_len);
+
+    QLIST_FOREACH(map, &dma->memory_maps, list) {
+        if ((map->buf == buffer) && (map->len == len)) {
+            QLIST_REMOVE(map, list);
+            free(map);
+        }
+    }
+}
+#endif /* CONFIG_IOMMU */
diff --git a/dma.h b/dma.h
index a66e3d7..ec06163 100644
--- a/dma.h
+++ b/dma.h
@@ -15,6 +15,7 @@ 
 #include "hw/hw.h"
 #include "block.h"
 
+typedef struct DMAContext DMAContext;
 typedef struct ScatterGatherEntry ScatterGatherEntry;
 
 typedef enum {
@@ -31,30 +32,82 @@  struct QEMUSGList {
 };
 
 #if defined(TARGET_PHYS_ADDR_BITS)
+
+#ifdef CONFIG_IOMMU
+/*
+ * When an IOMMU is present, bus addresses become distinct from
+ * CPU/memory physical addresses and may be a different size.  Because
+ * the IOVA size depends more on the bus than on the platform, we more
+ * or less have to treat these as 64-bit always to cover all (or at
+ * least most) cases.
+ */
+typedef uint64_t dma_addr_t;
+
+#define DMA_ADDR_BITS 64
+#define DMA_ADDR_FMT "%" PRIx64
+#else
 typedef target_phys_addr_t dma_addr_t;
 
 #define DMA_ADDR_BITS TARGET_PHYS_ADDR_BITS
 #define DMA_ADDR_FMT TARGET_FMT_plx
+#endif
+
+typedef int DMATranslateFunc(DMAContext *dma,
+                             dma_addr_t addr,
+                             target_phys_addr_t *paddr,
+                             target_phys_addr_t *len,
+                             DMADirection dir);
+
+typedef struct DMAContext {
+#ifdef CONFIG_IOMMU
+    DMATranslateFunc *translate;
+    QLIST_HEAD(memory_maps, DMAMemoryMap) memory_maps;
+#endif
+} DMAContext;
+
+#ifdef CONFIG_IOMMU
+static inline bool dma_has_iommu(DMAContext *dma)
+{
+    return !!dma;
+}
+#else
+static inline bool dma_has_iommu(DMAContext *dma)
+{
+    return false;
+}
+#endif
 
 typedef void DMAInvalidateMapFunc(void *);
 
 /* Checks that the given range of addresses is valid for DMA.  This is
  * useful for certain cases, but usually you should just use
  * dma_memory_{read,write}() and check for errors */
-static inline bool dma_memory_valid(DMAContext *dma, dma_addr_t addr,
-                                    dma_addr_t len, DMADirection dir)
+bool __dma_memory_valid(DMAContext *dma, dma_addr_t addr, dma_addr_t len,
+                        DMADirection dir);
+static inline bool dma_memory_valid(DMAContext *dma,
+                                    dma_addr_t addr, dma_addr_t len,
+                                    DMADirection dir)
 {
-    /* Stub version, with no iommu we assume all bus addresses are valid */
-    return true;
+    if (!dma_has_iommu(dma)) {
+        return true;
+    } else {
+        return __dma_memory_valid(dma, addr, len, dir);
+    }
 }
 
+int __dma_memory_rw(DMAContext *dma, dma_addr_t addr,
+                    void *buf, dma_addr_t len, DMADirection dir);
 static inline int dma_memory_rw(DMAContext *dma, dma_addr_t addr,
                                 void *buf, dma_addr_t len, DMADirection dir)
 {
-    /* Stub version when we have no iommu support */
-    cpu_physical_memory_rw(addr, buf, (target_phys_addr_t)len,
-                           dir == DMA_DIRECTION_FROM_DEVICE);
-    return 0;
+    if (!dma_has_iommu(dma)) {
+        /* Fast-path for no IOMMU */
+        cpu_physical_memory_rw(addr, buf, len,
+                               dir == DMA_DIRECTION_FROM_DEVICE);
+        return 0;
+    } else {
+        return __dma_memory_rw(dma, addr, buf, len, dir);
+    }
 }
 
 static inline int dma_memory_read(DMAContext *dma, dma_addr_t addr,
@@ -70,35 +123,55 @@  static inline int dma_memory_write(DMAContext *dma, dma_addr_t addr,
                          DMA_DIRECTION_FROM_DEVICE);
 }
 
+int __dma_memory_zero(DMAContext *dma, dma_addr_t addr, dma_addr_t len);
 static inline int dma_memory_zero(DMAContext *dma, dma_addr_t addr,
                                   dma_addr_t len)
 {
-    /* Stub version when we have no iommu support */
-    cpu_physical_memory_zero(addr, len);
-    return 0;
+    if (!dma_has_iommu(dma)) {
+        /* Fast-path for no IOMMU */
+        cpu_physical_memory_zero(addr, len);
+        return 0;
+    } else {
+        return __dma_memory_zero(dma, addr, len);
+    }
 }
 
+void *__dma_memory_map(DMAContext *dma,
+                       DMAInvalidateMapFunc *cb, void *opaque,
+                       dma_addr_t addr, dma_addr_t *len,
+                       DMADirection dir);
 static inline void *dma_memory_map(DMAContext *dma,
-                                   DMAInvalidateMapFunc *cb, void *opaque,
+                                   DMAInvalidateMapFunc *cb, void *cb_opaque,
                                    dma_addr_t addr, dma_addr_t *len,
                                    DMADirection dir)
 {
-    target_phys_addr_t xlen = *len;
-    void *p;
-
-    p = cpu_physical_memory_map(addr, &xlen,
-                                dir == DMA_DIRECTION_FROM_DEVICE);
-    *len = xlen;
-    return p;
+    if (!dma_has_iommu(dma)) {
+        target_phys_addr_t xlen = *len;
+        void *p;
+
+        p = cpu_physical_memory_map(addr, &xlen,
+                                    dir == DMA_DIRECTION_FROM_DEVICE);
+        *len = xlen;
+        return p;
+    } else {
+        return __dma_memory_map(dma, cb, cb_opaque, addr, len, dir);
+    }
 }
 
+void __dma_memory_unmap(DMAContext *dma,
+                        void *buffer, dma_addr_t len,
+                        DMADirection dir, dma_addr_t access_len);
 static inline void dma_memory_unmap(DMAContext *dma,
                                     void *buffer, dma_addr_t len,
                                     DMADirection dir, dma_addr_t access_len)
 {
-    return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
-                                     dir == DMA_DIRECTION_FROM_DEVICE,
-                                     access_len);
+    if (!dma_has_iommu(dma)) {
+        return cpu_physical_memory_unmap(buffer, (target_phys_addr_t)len,
+                                         dir == DMA_DIRECTION_FROM_DEVICE,
+                                         access_len);
+    } else {
+        __dma_memory_unmap(dma, buffer, len, dir, access_len);
+    }
 }
 
 #define DEFINE_LDST_DMA(_lname, _sname, _bits, _end) \
@@ -139,6 +212,14 @@  DEFINE_LDST_DMA(q, q, 64, be);
 
 #undef DEFINE_LDST_DMA
 
+#ifdef CONFIG_IOMMU
+
+void dma_context_init(DMAContext *dma, DMATranslateFunc fn);
+void dma_invalidate_memory_range(DMAContext *dma,
+                                 dma_addr_t addr, dma_addr_t len);
+
+#endif /* CONFIG_IOMMU */
+
 struct ScatterGatherEntry {
     dma_addr_t base;
     dma_addr_t len;