Patchwork [v2,3/7] memory: iommu support

login
register
mail settings
Submitter Avi Kivity
Date Oct. 30, 2012, 11:47 a.m.
Message ID <1351597670-23031-4-git-send-email-avi@redhat.com>
Download mbox | patch
Permalink /patch/195467/
State New
Headers show

Comments

Avi Kivity - Oct. 30, 2012, 11:47 a.m.
Add a new memory region type that translates addresses it is given,
then forwards them to a target address space.  This is similar to
an alias, except that the mapping is more flexible than a linear
translation and trucation, and also less efficient since the
translation happens at runtime.

The implementation uses an AddressSpace mapping the target region to
avoid hierarchical dispatch all the way to the resolved region; only
iommu regions are looked up dynamically.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 exec.c   |  28 ++++++++++++++---
 memory.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 memory.h |  45 +++++++++++++++++++++++++++
 3 files changed, 175 insertions(+), 4 deletions(-)
Blue Swirl - Oct. 30, 2012, 7:11 p.m.
On Tue, Oct 30, 2012 at 11:47 AM, Avi Kivity <avi@redhat.com> wrote:
> Add a new memory region type that translates addresses it is given,
> then forwards them to a target address space.  This is similar to
> an alias, except that the mapping is more flexible than a linear
> translation and trucation, and also less efficient since the
> translation happens at runtime.
>
> The implementation uses an AddressSpace mapping the target region to
> avoid hierarchical dispatch all the way to the resolved region; only
> iommu regions are looked up dynamically.
>
> Signed-off-by: Avi Kivity <avi@redhat.com>
> ---
>  exec.c   |  28 ++++++++++++++---
>  memory.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  memory.h |  45 +++++++++++++++++++++++++++
>  3 files changed, 175 insertions(+), 4 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index deee8ec..c3914fc 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -3507,6 +3507,7 @@ void cpu_physical_memory_write_rom(hwaddr addr,
>
>  typedef struct {
>      void *buffer;
> +    AddressSpace *as;
>      hwaddr addr;
>      hwaddr len;
>  } BounceBuffer;
> @@ -3572,23 +3573,42 @@ void *address_space_map(AddressSpace *as,
>      ram_addr_t raddr = RAM_ADDR_MAX;
>      ram_addr_t rlen;
>      void *ret;
> +    IOMMUTLBEntry iotlb;
> +    hwaddr xlat;
> +    AddressSpace *as_xlat;
>
>      while (len > 0) {
> +        xlat = addr;
> +        as_xlat = as;
>          page = addr & TARGET_PAGE_MASK;
>          l = (page + TARGET_PAGE_SIZE) - addr;
>          if (l > len)
>              l = len;
>          section = phys_page_find(d, page >> TARGET_PAGE_BITS);
>
> +        while (section->mr->iommu_ops) {
> +            iotlb = section->mr->iommu_ops->translate(section->mr, xlat);
> +            if (iotlb.perm[is_write]) {
> +                xlat = ((iotlb.translated_addr & ~iotlb.addr_mask)
> +                        | (addr & iotlb.addr_mask));
> +                as_xlat = section->mr->iommu_target_as;
> +                l = (MIN(xlat + l - 1, xlat | iotlb.addr_mask) - xlat) + 1;
> +                section = phys_page_find(as_xlat->dispatch, xlat >> TARGET_PAGE_BITS);
> +            } else {
> +                section = &phys_sections[phys_section_unassigned];
> +            }
> +        }
> +
>          if (!(memory_region_is_ram(section->mr) && !section->readonly)) {
>              if (todo || bounce.buffer) {
>                  break;
>              }
>              bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
> -            bounce.addr = addr;
> +            bounce.addr = xlat;
> +            bounce.as = as_xlat;
>              bounce.len = l;
>              if (!is_write) {
> -                address_space_read(as, addr, bounce.buffer, l);
> +                address_space_read(as_xlat, xlat, bounce.buffer, l);
>              }
>
>              *plen = l;
> @@ -3596,7 +3616,7 @@ void *address_space_map(AddressSpace *as,
>          }
>          if (!todo) {
>              raddr = memory_region_get_ram_addr(section->mr)
> -                + memory_region_section_addr(section, addr);
> +                + memory_region_section_addr(section, xlat);
>          }
>
>          len -= l;
> @@ -3635,7 +3655,7 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>          return;
>      }
>      if (is_write) {
> -        address_space_write(as, bounce.addr, bounce.buffer, access_len);
> +        address_space_write(bounce.as, bounce.addr, bounce.buffer, access_len);
>      }
>      qemu_vfree(bounce.buffer);
>      bounce.buffer = NULL;
> diff --git a/memory.c b/memory.c
> index ae3552b..ba2d4a0 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -775,6 +775,12 @@ static void memory_region_destructor_rom_device(MemoryRegion *mr)
>      qemu_ram_free(mr->ram_addr & TARGET_PAGE_MASK);
>  }
>
> +static void memory_region_destructor_iommu(MemoryRegion *mr)
> +{
> +    address_space_destroy(mr->iommu_target_as);
> +    g_free(mr->iommu_target_as);
> +}
> +
>  static bool memory_region_wrong_endianness(MemoryRegion *mr)
>  {
>  #ifdef TARGET_WORDS_BIGENDIAN
> @@ -789,6 +795,7 @@ void memory_region_init(MemoryRegion *mr,
>                          uint64_t size)
>  {
>      mr->ops = NULL;
> +    mr->iommu_ops = NULL;
>      mr->parent = NULL;
>      mr->size = int128_make64(size);
>      if (size == UINT64_MAX) {
> @@ -980,6 +987,100 @@ void memory_region_init_rom_device(MemoryRegion *mr,
>      mr->ram_addr = qemu_ram_alloc(size, mr);
>  }
>
> +static void memory_region_iommu_rw(MemoryRegion *iommu, hwaddr addr,
> +                                   uint8_t *buf, unsigned len, bool is_write)
> +{
> +    IOMMUTLBEntry tlb;
> +    unsigned clen;
> +    hwaddr xlat;
> +
> +    while (len) {
> +        tlb = iommu->iommu_ops->translate(iommu, addr);
> +        clen = (MIN(addr | tlb.addr_mask, addr + len - 1) - addr) + 1;
> +        if (tlb.perm[is_write]) {
> +            xlat = (tlb.translated_addr & ~tlb.addr_mask) | (addr & tlb.addr_mask);
> +            address_space_rw(iommu->iommu_target_as, xlat, buf, clen, is_write);
> +        } else {
> +            if (!is_write) {
> +                memset(buf, 0xff, clen);
> +            }
> +        }
> +        buf += clen;
> +        addr += clen;
> +        len -= clen;
> +    }
> +}
> +
> +static uint64_t memory_region_iommu_read(void *opaque, hwaddr addr,
> +                                         unsigned size)
> +{
> +    MemoryRegion *iommu = opaque;
> +    union {
> +        uint8_t buf[8];
> +        uint8_t u8;
> +        uint16_t u16;
> +        uint32_t u32;
> +        uint64_t u64;
> +    } ret;
> +
> +    memory_region_iommu_rw(iommu, addr, ret.buf, size, false);
> +    switch (size) {
> +    case 1: return ret.u8;
> +    case 2: return ret.u16;
> +    case 4: return ret.u32;
> +    case 8: return ret.u64;
> +    default: abort();
> +    }
> +}
> +
> +static void memory_region_iommu_write(void *opaque, hwaddr addr,
> +                                      uint64_t data, unsigned size)
> +{
> +    MemoryRegion *iommu = opaque;
> +    union {
> +        uint8_t buf[8];
> +        uint8_t u8;
> +        uint16_t u16;
> +        uint32_t u32;
> +        uint64_t u64;
> +    } in;
> +
> +    switch (size) {
> +    case 1: in.u8 = data; break;
> +    case 2: in.u16 = data; break;
> +    case 4: in.u32 = data; break;
> +    case 8: in.u64 = data; break;
> +    default: abort();
> +    }
> +    memory_region_iommu_rw(iommu, addr, in.buf, size, true);
> +}
> +
> +static MemoryRegionOps memory_region_iommu_ops = {
> +    .read = memory_region_iommu_read,
> +    .write = memory_region_iommu_write,
> +#ifdef HOST_BIGENDIAN
> +    .endianness = DEVICE_BIG_ENDIAN,

Why couple this with host endianness? I'd expect IOMMU to operate at
target bus endianness, for example LE for PCI on PPC guest.

> +#else
> +    .endianness = DEVICE_LITTLE_ENDIAN,
> +#endif
> +};
> +
> +void memory_region_init_iommu(MemoryRegion *mr,
> +                              MemoryRegionIOMMUOps *ops,
> +                              MemoryRegion *target,
> +                              const char *name,
> +                              uint64_t size)
> +{
> +    memory_region_init(mr, name, size);
> +    mr->ops = &memory_region_iommu_ops;
> +    mr->iommu_ops = ops,
> +    mr->opaque = mr;
> +    mr->terminates = true;  /* then re-forwards */
> +    mr->destructor = memory_region_destructor_iommu;
> +    mr->iommu_target_as = g_new(AddressSpace, 1);
> +    address_space_init(mr->iommu_target_as, target);
> +}
> +
>  static uint64_t invalid_read(void *opaque, hwaddr addr,
>                               unsigned size)
>  {
> @@ -1054,6 +1155,11 @@ bool memory_region_is_rom(MemoryRegion *mr)
>      return mr->ram && mr->readonly;
>  }
>
> +bool memory_region_is_iommu(MemoryRegion *mr)
> +{
> +    return mr->iommu_ops;
> +}
> +
>  void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
>  {
>      uint8_t mask = 1 << client;
> diff --git a/memory.h b/memory.h
> index 9462bfd..47362c9 100644
> --- a/memory.h
> +++ b/memory.h
> @@ -113,12 +113,28 @@ struct MemoryRegionOps {
>      const MemoryRegionMmio old_mmio;
>  };
>
> +typedef struct IOMMUTLBEntry IOMMUTLBEntry;
> +typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
> +
> +struct IOMMUTLBEntry {
> +    hwaddr device_addr;
> +    hwaddr translated_addr;
> +    hwaddr addr_mask;  /* 0xfff = 4k translation */
> +    bool perm[2]; /* read/write permissions */

Please document that bit 1 is write and 0 read.

> +};
> +
> +struct MemoryRegionIOMMUOps {
> +    /* Returns a TLB entry that contains a given address. */
> +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr);

Maybe the MemoryRegion could be const to declare that the translation
does not change mappings.

> +};
> +
>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
>  typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
>
>  struct MemoryRegion {
>      /* All fields are private - violators will be prosecuted */
>      const MemoryRegionOps *ops;
> +    const MemoryRegionIOMMUOps *iommu_ops;
>      void *opaque;
>      MemoryRegion *parent;
>      Int128 size;
> @@ -145,6 +161,7 @@ struct MemoryRegion {
>      uint8_t dirty_log_mask;
>      unsigned ioeventfd_nb;
>      MemoryRegionIoeventfd *ioeventfds;
> +    struct AddressSpace *iommu_target_as;
>  };
>
>  struct MemoryRegionPortio {
> @@ -334,6 +351,25 @@ void memory_region_init_rom_device(MemoryRegion *mr,
>  void memory_region_init_reservation(MemoryRegion *mr,
>                                      const char *name,
>                                      uint64_t size);
> +
> +/**
> + * memory_region_init_iommu: Initialize a memory region that translates addresses
> + *
> + * An IOMMU region translates addresses and forwards accesses to a target memory region.
> + *
> + * @mr: the #MemoryRegion to be initialized
> + * @ops: a function that translates addresses into the @target region
> + * @target: a #MemoryRegion that will be used to satisfy accesses to translated
> + *          addresses
> + * @name: used for debugging; not visible to the user or ABI
> + * @size: size of the region.
> + */
> +void memory_region_init_iommu(MemoryRegion *mr,
> +                              MemoryRegionIOMMUOps *ops,
> +                              MemoryRegion *target,
> +                              const char *name,
> +                              uint64_t size);
> +
>  /**
>   * memory_region_destroy: Destroy a memory region and reclaim all resources.
>   *
> @@ -373,6 +409,15 @@ static inline bool memory_region_is_romd(MemoryRegion *mr)
>  }
>
>  /**
> + * memory_region_is_ram: check whether a memory region is an iommu
> + *
> + * Returns %true is a memory region is an iommu.
> + *
> + * @mr: the memory region being queried
> + */
> +bool memory_region_is_iommu(MemoryRegion *mr);
> +
> +/**
>   * memory_region_name: get a memory region's name
>   *
>   * Returns the string that was used to initialize the memory region.
> --
> 1.7.12
>
Benjamin Herrenschmidt - Oct. 30, 2012, 8:03 p.m.
On Tue, 2012-10-30 at 19:11 +0000, Blue Swirl wrote:

> Why couple this with host endianness? I'd expect IOMMU to operate at
> target bus endianness, for example LE for PCI on PPC guest.

I'm not sure about putting the iommu "in charge" of endianness ...

On one hand it's fishy. It should be 'transparent', the device is what
controls its own endianness and playing games at the iommu level is
going to result into tears... on the other hand I can see the appeal of
not bothering at the device level and letting the iommu do it for you...
but I still think it's risky.

Besides not all PCI devices are little endian :-) Also that won't deal
with map/unmap etc...

So I'd vote for sanity here and make the iommu not affect endianness in
any way and let the devices do the "right thing" as is the expectation
today.

Ben.

> > +#else
> > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > +#endif
> > +};
> > +
> > +void memory_region_init_iommu(MemoryRegion *mr,
> > +                              MemoryRegionIOMMUOps *ops,
> > +                              MemoryRegion *target,
> > +                              const char *name,
> > +                              uint64_t size)
> > +{
> > +    memory_region_init(mr, name, size);
> > +    mr->ops = &memory_region_iommu_ops;
> > +    mr->iommu_ops = ops,
> > +    mr->opaque = mr;
> > +    mr->terminates = true;  /* then re-forwards */
> > +    mr->destructor = memory_region_destructor_iommu;
> > +    mr->iommu_target_as = g_new(AddressSpace, 1);
> > +    address_space_init(mr->iommu_target_as, target);
> > +}
> > +
> >  static uint64_t invalid_read(void *opaque, hwaddr addr,
> >                               unsigned size)
> >  {
> > @@ -1054,6 +1155,11 @@ bool memory_region_is_rom(MemoryRegion *mr)
> >      return mr->ram && mr->readonly;
> >  }
> >
> > +bool memory_region_is_iommu(MemoryRegion *mr)
> > +{
> > +    return mr->iommu_ops;
> > +}
> > +
> >  void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
> >  {
> >      uint8_t mask = 1 << client;
> > diff --git a/memory.h b/memory.h
> > index 9462bfd..47362c9 100644
> > --- a/memory.h
> > +++ b/memory.h
> > @@ -113,12 +113,28 @@ struct MemoryRegionOps {
> >      const MemoryRegionMmio old_mmio;
> >  };
> >
> > +typedef struct IOMMUTLBEntry IOMMUTLBEntry;
> > +typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
> > +
> > +struct IOMMUTLBEntry {
> > +    hwaddr device_addr;
> > +    hwaddr translated_addr;
> > +    hwaddr addr_mask;  /* 0xfff = 4k translation */
> > +    bool perm[2]; /* read/write permissions */
> 
> Please document that bit 1 is write and 0 read.
> 
> > +};
> > +
> > +struct MemoryRegionIOMMUOps {
> > +    /* Returns a TLB entry that contains a given address. */
> > +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr);
> 
> Maybe the MemoryRegion could be const to declare that the translation
> does not change mappings.
> 
> > +};
> > +
> >  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> >  typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
> >
> >  struct MemoryRegion {
> >      /* All fields are private - violators will be prosecuted */
> >      const MemoryRegionOps *ops;
> > +    const MemoryRegionIOMMUOps *iommu_ops;
> >      void *opaque;
> >      MemoryRegion *parent;
> >      Int128 size;
> > @@ -145,6 +161,7 @@ struct MemoryRegion {
> >      uint8_t dirty_log_mask;
> >      unsigned ioeventfd_nb;
> >      MemoryRegionIoeventfd *ioeventfds;
> > +    struct AddressSpace *iommu_target_as;
> >  };
> >
> >  struct MemoryRegionPortio {
> > @@ -334,6 +351,25 @@ void memory_region_init_rom_device(MemoryRegion *mr,
> >  void memory_region_init_reservation(MemoryRegion *mr,
> >                                      const char *name,
> >                                      uint64_t size);
> > +
> > +/**
> > + * memory_region_init_iommu: Initialize a memory region that translates addresses
> > + *
> > + * An IOMMU region translates addresses and forwards accesses to a target memory region.
> > + *
> > + * @mr: the #MemoryRegion to be initialized
> > + * @ops: a function that translates addresses into the @target region
> > + * @target: a #MemoryRegion that will be used to satisfy accesses to translated
> > + *          addresses
> > + * @name: used for debugging; not visible to the user or ABI
> > + * @size: size of the region.
> > + */
> > +void memory_region_init_iommu(MemoryRegion *mr,
> > +                              MemoryRegionIOMMUOps *ops,
> > +                              MemoryRegion *target,
> > +                              const char *name,
> > +                              uint64_t size);
> > +
> >  /**
> >   * memory_region_destroy: Destroy a memory region and reclaim all resources.
> >   *
> > @@ -373,6 +409,15 @@ static inline bool memory_region_is_romd(MemoryRegion *mr)
> >  }
> >
> >  /**
> > + * memory_region_is_ram: check whether a memory region is an iommu
> > + *
> > + * Returns %true is a memory region is an iommu.
> > + *
> > + * @mr: the memory region being queried
> > + */
> > +bool memory_region_is_iommu(MemoryRegion *mr);
> > +
> > +/**
> >   * memory_region_name: get a memory region's name
> >   *
> >   * Returns the string that was used to initialize the memory region.
> > --
> > 1.7.12
> >
Blue Swirl - Oct. 30, 2012, 9:13 p.m.
On Tue, Oct 30, 2012 at 8:03 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Tue, 2012-10-30 at 19:11 +0000, Blue Swirl wrote:
>
>> Why couple this with host endianness? I'd expect IOMMU to operate at
>> target bus endianness, for example LE for PCI on PPC guest.
>
> I'm not sure about putting the iommu "in charge" of endianness ...
>
> On one hand it's fishy. It should be 'transparent', the device is what
> controls its own endianness and playing games at the iommu level is
> going to result into tears... on the other hand I can see the appeal of
> not bothering at the device level and letting the iommu do it for you...
> but I still think it's risky.
>
> Besides not all PCI devices are little endian :-) Also that won't deal
> with map/unmap etc...
>
> So I'd vote for sanity here and make the iommu not affect endianness in
> any way and let the devices do the "right thing" as is the expectation
> today.

Yes. Does the IOMMU even need to touch the data bus at all, because it
only translates address bus bits (ignoring caches/TLBs)?

>
> Ben.
>
>> > +#else
>> > +    .endianness = DEVICE_LITTLE_ENDIAN,
>> > +#endif
>> > +};
>> > +
>> > +void memory_region_init_iommu(MemoryRegion *mr,
>> > +                              MemoryRegionIOMMUOps *ops,
>> > +                              MemoryRegion *target,
>> > +                              const char *name,
>> > +                              uint64_t size)
>> > +{
>> > +    memory_region_init(mr, name, size);
>> > +    mr->ops = &memory_region_iommu_ops;
>> > +    mr->iommu_ops = ops,
>> > +    mr->opaque = mr;
>> > +    mr->terminates = true;  /* then re-forwards */
>> > +    mr->destructor = memory_region_destructor_iommu;
>> > +    mr->iommu_target_as = g_new(AddressSpace, 1);
>> > +    address_space_init(mr->iommu_target_as, target);
>> > +}
>> > +
>> >  static uint64_t invalid_read(void *opaque, hwaddr addr,
>> >                               unsigned size)
>> >  {
>> > @@ -1054,6 +1155,11 @@ bool memory_region_is_rom(MemoryRegion *mr)
>> >      return mr->ram && mr->readonly;
>> >  }
>> >
>> > +bool memory_region_is_iommu(MemoryRegion *mr)
>> > +{
>> > +    return mr->iommu_ops;
>> > +}
>> > +
>> >  void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
>> >  {
>> >      uint8_t mask = 1 << client;
>> > diff --git a/memory.h b/memory.h
>> > index 9462bfd..47362c9 100644
>> > --- a/memory.h
>> > +++ b/memory.h
>> > @@ -113,12 +113,28 @@ struct MemoryRegionOps {
>> >      const MemoryRegionMmio old_mmio;
>> >  };
>> >
>> > +typedef struct IOMMUTLBEntry IOMMUTLBEntry;
>> > +typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
>> > +
>> > +struct IOMMUTLBEntry {
>> > +    hwaddr device_addr;
>> > +    hwaddr translated_addr;
>> > +    hwaddr addr_mask;  /* 0xfff = 4k translation */
>> > +    bool perm[2]; /* read/write permissions */
>>
>> Please document that bit 1 is write and 0 read.
>>
>> > +};
>> > +
>> > +struct MemoryRegionIOMMUOps {
>> > +    /* Returns a TLB entry that contains a given address. */
>> > +    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr);
>>
>> Maybe the MemoryRegion could be const to declare that the translation
>> does not change mappings.
>>
>> > +};
>> > +
>> >  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
>> >  typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
>> >
>> >  struct MemoryRegion {
>> >      /* All fields are private - violators will be prosecuted */
>> >      const MemoryRegionOps *ops;
>> > +    const MemoryRegionIOMMUOps *iommu_ops;
>> >      void *opaque;
>> >      MemoryRegion *parent;
>> >      Int128 size;
>> > @@ -145,6 +161,7 @@ struct MemoryRegion {
>> >      uint8_t dirty_log_mask;
>> >      unsigned ioeventfd_nb;
>> >      MemoryRegionIoeventfd *ioeventfds;
>> > +    struct AddressSpace *iommu_target_as;
>> >  };
>> >
>> >  struct MemoryRegionPortio {
>> > @@ -334,6 +351,25 @@ void memory_region_init_rom_device(MemoryRegion *mr,
>> >  void memory_region_init_reservation(MemoryRegion *mr,
>> >                                      const char *name,
>> >                                      uint64_t size);
>> > +
>> > +/**
>> > + * memory_region_init_iommu: Initialize a memory region that translates addresses
>> > + *
>> > + * An IOMMU region translates addresses and forwards accesses to a target memory region.
>> > + *
>> > + * @mr: the #MemoryRegion to be initialized
>> > + * @ops: a function that translates addresses into the @target region
>> > + * @target: a #MemoryRegion that will be used to satisfy accesses to translated
>> > + *          addresses
>> > + * @name: used for debugging; not visible to the user or ABI
>> > + * @size: size of the region.
>> > + */
>> > +void memory_region_init_iommu(MemoryRegion *mr,
>> > +                              MemoryRegionIOMMUOps *ops,
>> > +                              MemoryRegion *target,
>> > +                              const char *name,
>> > +                              uint64_t size);
>> > +
>> >  /**
>> >   * memory_region_destroy: Destroy a memory region and reclaim all resources.
>> >   *
>> > @@ -373,6 +409,15 @@ static inline bool memory_region_is_romd(MemoryRegion *mr)
>> >  }
>> >
>> >  /**
>> > + * memory_region_is_ram: check whether a memory region is an iommu
>> > + *
>> > + * Returns %true is a memory region is an iommu.
>> > + *
>> > + * @mr: the memory region being queried
>> > + */
>> > +bool memory_region_is_iommu(MemoryRegion *mr);
>> > +
>> > +/**
>> >   * memory_region_name: get a memory region's name
>> >   *
>> >   * Returns the string that was used to initialize the memory region.
>> > --
>> > 1.7.12
>> >
>
>
Avi Kivity - Oct. 31, 2012, 10:32 a.m.
On 10/30/2012 09:11 PM, Blue Swirl wrote:
> On Tue, Oct 30, 2012 at 11:47 AM, Avi Kivity <avi@redhat.com> wrote:
>> Add a new memory region type that translates addresses it is given,
>> then forwards them to a target address space.  This is similar to
>> an alias, except that the mapping is more flexible than a linear
>> translation and trucation, and also less efficient since the
>> translation happens at runtime.
>>
>> The implementation uses an AddressSpace mapping the target region to
>> avoid hierarchical dispatch all the way to the resolved region; only
>> iommu regions are looked up dynamically.
>>
>> +
>> +static MemoryRegionOps memory_region_iommu_ops = {
>> +    .read = memory_region_iommu_read,
>> +    .write = memory_region_iommu_write,
>> +#ifdef HOST_BIGENDIAN
>> +    .endianness = DEVICE_BIG_ENDIAN,
> 
> Why couple this with host endianness? I'd expect IOMMU to operate at
> target bus endianness, for example LE for PCI on PPC guest.

This has nothing to do with device endianness; we're translating from a
byte buffer (address_space_rw()) to a uint64_t
(MemoryRegionOps::read/write()) so we need to take host endianess into
account.

This code cleverly makes use of memory core endian support to do the
conversion for us.  But it's probably too clever and should be replaced
by an explicit call to a helper.
Benjamin Herrenschmidt - Oct. 31, 2012, 6:59 p.m.
On Wed, 2012-10-31 at 12:32 +0200, Avi Kivity wrote:
> This has nothing to do with device endianness; we're translating from a
> byte buffer (address_space_rw()) to a uint64_t
> (MemoryRegionOps::read/write()) so we need to take host endianess into
> account.
> 
> This code cleverly makes use of memory core endian support to do the
> conversion for us.  But it's probably too clever and should be replaced
> by an explicit call to a helper.

Well, the whole idea of providing sized-type accessors (u8/u16/...) is
very fishy for DMA. I understand it comes from the MemoryRegion ops, and
It made somewhat sense in CPU to device (though even then endianness had
always gotten wrong) but for DMA it's going to be a can of worms.

Taking "host endianness" into account means nothing if you don't define
what endianness those are expected to use to write to memory. If you add
yet another case of "guest endianness" in the mix, then you make yet
another mistake akin to the virtio trainwreck since things like powerpc
or ARM can operate in different endianness.

The best thing to do is to forbid use of those functions :-) And
basically mandate the use of endian explicit accessor that specifically
indicate what endianness the value should have once written in target
memory. The most sensible expectation when using the "raw" Memory ops is
to have the value go "as is" ie in host endianness.

Cheers,
Ben.
Avi Kivity - Nov. 1, 2012, 1:44 p.m.
On 10/31/2012 08:59 PM, Benjamin Herrenschmidt wrote:
> On Wed, 2012-10-31 at 12:32 +0200, Avi Kivity wrote:
>> This has nothing to do with device endianness; we're translating from a
>> byte buffer (address_space_rw()) to a uint64_t
>> (MemoryRegionOps::read/write()) so we need to take host endianess into
>> account.
>> 
>> This code cleverly makes use of memory core endian support to do the
>> conversion for us.  But it's probably too clever and should be replaced
>> by an explicit call to a helper.
> 
> Well, the whole idea of providing sized-type accessors (u8/u16/...) is
> very fishy for DMA. I understand it comes from the MemoryRegion ops, and
> It made somewhat sense in CPU to device (though even then endianness had
> always gotten wrong) but for DMA it's going to be a can of worms.

Endianness here has no effect on the result.  An address_space_rw()
causes a lookup of a memory region, which happens to be an iommu memory
region.  Because of the way MemoryRegionOps are done, it is converted to
a 1/2/4/8 accessor, and then converted immediately back to a byte array.
 As long as we're consistent there's no change to the data path.

However we do have a problem with non-1/2/4/8 byte writes.  Right now
any mismatched access ends up as an 8 byte write, we need an extra
accessor for arbitrary writes, or rather better use of the .impl members
of MemoryRegionOps.

> 
> Taking "host endianness" into account means nothing if you don't define
> what endianness those are expected to use to write to memory. If you add
> yet another case of "guest endianness" in the mix, then you make yet
> another mistake akin to the virtio trainwreck since things like powerpc
> or ARM can operate in different endianness.
> 
> The best thing to do is to forbid use of those functions :-) And
> basically mandate the use of endian explicit accessor that specifically
> indicate what endianness the value should have once written in target
> memory. The most sensible expectation when using the "raw" Memory ops is
> to have the value go "as is" ie in host endianness.
Avi Kivity - Nov. 1, 2012, 1:45 p.m.
On 11/01/2012 03:44 PM, Avi Kivity wrote:
> 
> However we do have a problem with non-1/2/4/8 byte writes.  Right now
> any mismatched access ends up as an 8 byte write, we need an extra
> accessor for arbitrary writes, or rather better use of the .impl members
> of MemoryRegionOps.

Sorry, it's converted into a series of 8-bit writes.  So the code is
correct now, if inefficient.

Patch

diff --git a/exec.c b/exec.c
index deee8ec..c3914fc 100644
--- a/exec.c
+++ b/exec.c
@@ -3507,6 +3507,7 @@  void cpu_physical_memory_write_rom(hwaddr addr,
 
 typedef struct {
     void *buffer;
+    AddressSpace *as;
     hwaddr addr;
     hwaddr len;
 } BounceBuffer;
@@ -3572,23 +3573,42 @@  void *address_space_map(AddressSpace *as,
     ram_addr_t raddr = RAM_ADDR_MAX;
     ram_addr_t rlen;
     void *ret;
+    IOMMUTLBEntry iotlb;
+    hwaddr xlat;
+    AddressSpace *as_xlat;
 
     while (len > 0) {
+        xlat = addr;
+        as_xlat = as;
         page = addr & TARGET_PAGE_MASK;
         l = (page + TARGET_PAGE_SIZE) - addr;
         if (l > len)
             l = len;
         section = phys_page_find(d, page >> TARGET_PAGE_BITS);
 
+        while (section->mr->iommu_ops) {
+            iotlb = section->mr->iommu_ops->translate(section->mr, xlat);
+            if (iotlb.perm[is_write]) {
+                xlat = ((iotlb.translated_addr & ~iotlb.addr_mask)
+                        | (addr & iotlb.addr_mask));
+                as_xlat = section->mr->iommu_target_as;
+                l = (MIN(xlat + l - 1, xlat | iotlb.addr_mask) - xlat) + 1;
+                section = phys_page_find(as_xlat->dispatch, xlat >> TARGET_PAGE_BITS);
+            } else {
+                section = &phys_sections[phys_section_unassigned];
+            }
+        }
+
         if (!(memory_region_is_ram(section->mr) && !section->readonly)) {
             if (todo || bounce.buffer) {
                 break;
             }
             bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, TARGET_PAGE_SIZE);
-            bounce.addr = addr;
+            bounce.addr = xlat;
+            bounce.as = as_xlat;
             bounce.len = l;
             if (!is_write) {
-                address_space_read(as, addr, bounce.buffer, l);
+                address_space_read(as_xlat, xlat, bounce.buffer, l);
             }
 
             *plen = l;
@@ -3596,7 +3616,7 @@  void *address_space_map(AddressSpace *as,
         }
         if (!todo) {
             raddr = memory_region_get_ram_addr(section->mr)
-                + memory_region_section_addr(section, addr);
+                + memory_region_section_addr(section, xlat);
         }
 
         len -= l;
@@ -3635,7 +3655,7 @@  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
         return;
     }
     if (is_write) {
-        address_space_write(as, bounce.addr, bounce.buffer, access_len);
+        address_space_write(bounce.as, bounce.addr, bounce.buffer, access_len);
     }
     qemu_vfree(bounce.buffer);
     bounce.buffer = NULL;
diff --git a/memory.c b/memory.c
index ae3552b..ba2d4a0 100644
--- a/memory.c
+++ b/memory.c
@@ -775,6 +775,12 @@  static void memory_region_destructor_rom_device(MemoryRegion *mr)
     qemu_ram_free(mr->ram_addr & TARGET_PAGE_MASK);
 }
 
+static void memory_region_destructor_iommu(MemoryRegion *mr)
+{
+    address_space_destroy(mr->iommu_target_as);
+    g_free(mr->iommu_target_as);
+}
+
 static bool memory_region_wrong_endianness(MemoryRegion *mr)
 {
 #ifdef TARGET_WORDS_BIGENDIAN
@@ -789,6 +795,7 @@  void memory_region_init(MemoryRegion *mr,
                         uint64_t size)
 {
     mr->ops = NULL;
+    mr->iommu_ops = NULL;
     mr->parent = NULL;
     mr->size = int128_make64(size);
     if (size == UINT64_MAX) {
@@ -980,6 +987,100 @@  void memory_region_init_rom_device(MemoryRegion *mr,
     mr->ram_addr = qemu_ram_alloc(size, mr);
 }
 
+static void memory_region_iommu_rw(MemoryRegion *iommu, hwaddr addr,
+                                   uint8_t *buf, unsigned len, bool is_write)
+{
+    IOMMUTLBEntry tlb;
+    unsigned clen;
+    hwaddr xlat;
+
+    while (len) {
+        tlb = iommu->iommu_ops->translate(iommu, addr);
+        clen = (MIN(addr | tlb.addr_mask, addr + len - 1) - addr) + 1;
+        if (tlb.perm[is_write]) {
+            xlat = (tlb.translated_addr & ~tlb.addr_mask) | (addr & tlb.addr_mask);
+            address_space_rw(iommu->iommu_target_as, xlat, buf, clen, is_write);
+        } else {
+            if (!is_write) {
+                memset(buf, 0xff, clen);
+            }
+        }
+        buf += clen;
+        addr += clen;
+        len -= clen;
+    }
+}
+
+static uint64_t memory_region_iommu_read(void *opaque, hwaddr addr,
+                                         unsigned size)
+{
+    MemoryRegion *iommu = opaque;
+    union {
+        uint8_t buf[8];
+        uint8_t u8;
+        uint16_t u16;
+        uint32_t u32;
+        uint64_t u64;
+    } ret;
+
+    memory_region_iommu_rw(iommu, addr, ret.buf, size, false);
+    switch (size) {
+    case 1: return ret.u8;
+    case 2: return ret.u16;
+    case 4: return ret.u32;
+    case 8: return ret.u64;
+    default: abort();
+    }
+}
+
+static void memory_region_iommu_write(void *opaque, hwaddr addr,
+                                      uint64_t data, unsigned size)
+{
+    MemoryRegion *iommu = opaque;
+    union {
+        uint8_t buf[8];
+        uint8_t u8;
+        uint16_t u16;
+        uint32_t u32;
+        uint64_t u64;
+    } in;
+
+    switch (size) {
+    case 1: in.u8 = data; break;
+    case 2: in.u16 = data; break;
+    case 4: in.u32 = data; break;
+    case 8: in.u64 = data; break;
+    default: abort();
+    }
+    memory_region_iommu_rw(iommu, addr, in.buf, size, true);
+}
+
+static MemoryRegionOps memory_region_iommu_ops = {
+    .read = memory_region_iommu_read,
+    .write = memory_region_iommu_write,
+#ifdef HOST_BIGENDIAN
+    .endianness = DEVICE_BIG_ENDIAN,
+#else
+    .endianness = DEVICE_LITTLE_ENDIAN,
+#endif
+};
+
+void memory_region_init_iommu(MemoryRegion *mr,
+                              MemoryRegionIOMMUOps *ops,
+                              MemoryRegion *target,
+                              const char *name,
+                              uint64_t size)
+{
+    memory_region_init(mr, name, size);
+    mr->ops = &memory_region_iommu_ops;
+    mr->iommu_ops = ops,
+    mr->opaque = mr;
+    mr->terminates = true;  /* then re-forwards */
+    mr->destructor = memory_region_destructor_iommu;
+    mr->iommu_target_as = g_new(AddressSpace, 1);
+    address_space_init(mr->iommu_target_as, target);
+}
+
 static uint64_t invalid_read(void *opaque, hwaddr addr,
                              unsigned size)
 {
@@ -1054,6 +1155,11 @@  bool memory_region_is_rom(MemoryRegion *mr)
     return mr->ram && mr->readonly;
 }
 
+bool memory_region_is_iommu(MemoryRegion *mr)
+{
+    return mr->iommu_ops;
+}
+
 void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client)
 {
     uint8_t mask = 1 << client;
diff --git a/memory.h b/memory.h
index 9462bfd..47362c9 100644
--- a/memory.h
+++ b/memory.h
@@ -113,12 +113,28 @@  struct MemoryRegionOps {
     const MemoryRegionMmio old_mmio;
 };
 
+typedef struct IOMMUTLBEntry IOMMUTLBEntry;
+typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
+
+struct IOMMUTLBEntry {
+    hwaddr device_addr;
+    hwaddr translated_addr;
+    hwaddr addr_mask;  /* 0xfff = 4k translation */
+    bool perm[2]; /* read/write permissions */
+};
+
+struct MemoryRegionIOMMUOps {
+    /* Returns a TLB entry that contains a given address. */
+    IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr);
+};
+
 typedef struct CoalescedMemoryRange CoalescedMemoryRange;
 typedef struct MemoryRegionIoeventfd MemoryRegionIoeventfd;
 
 struct MemoryRegion {
     /* All fields are private - violators will be prosecuted */
     const MemoryRegionOps *ops;
+    const MemoryRegionIOMMUOps *iommu_ops;
     void *opaque;
     MemoryRegion *parent;
     Int128 size;
@@ -145,6 +161,7 @@  struct MemoryRegion {
     uint8_t dirty_log_mask;
     unsigned ioeventfd_nb;
     MemoryRegionIoeventfd *ioeventfds;
+    struct AddressSpace *iommu_target_as;
 };
 
 struct MemoryRegionPortio {
@@ -334,6 +351,25 @@  void memory_region_init_rom_device(MemoryRegion *mr,
 void memory_region_init_reservation(MemoryRegion *mr,
                                     const char *name,
                                     uint64_t size);
+
+/**
+ * memory_region_init_iommu: Initialize a memory region that translates addresses
+ *
+ * An IOMMU region translates addresses and forwards accesses to a target memory region.
+ *
+ * @mr: the #MemoryRegion to be initialized
+ * @ops: a function that translates addresses into the @target region
+ * @target: a #MemoryRegion that will be used to satisfy accesses to translated
+ *          addresses
+ * @name: used for debugging; not visible to the user or ABI
+ * @size: size of the region.
+ */
+void memory_region_init_iommu(MemoryRegion *mr,
+                              MemoryRegionIOMMUOps *ops,
+                              MemoryRegion *target,
+                              const char *name,
+                              uint64_t size);
+
 /**
  * memory_region_destroy: Destroy a memory region and reclaim all resources.
  *
@@ -373,6 +409,15 @@  static inline bool memory_region_is_romd(MemoryRegion *mr)
 }
 
 /**
+ * memory_region_is_ram: check whether a memory region is an iommu
+ *
+ * Returns %true is a memory region is an iommu.
+ *
+ * @mr: the memory region being queried
+ */
+bool memory_region_is_iommu(MemoryRegion *mr);
+
+/**
  * memory_region_name: get a memory region's name
  *
  * Returns the string that was used to initialize the memory region.