[RFC,6/8] memory: introduce AddressSpaceOps

Message ID 1493285660-4470-7-git-send-email-peterx@redhat.com
State New

Commit Message

Peter Xu April 27, 2017, 9:34 a.m. UTC
This is similar to MemoryRegionOps; it's just a place for address
spaces to store arch-specific hooks.

The first hook I would like to introduce is iommu_get().

For systems that have IOMMUs, we will create a special address space
per device which is different from the system default address space
(please refer to pci_device_iommu_address_space()). Normally when
that happens, there will be one specific IOMMU (or say, translation
unit) standing right behind that new address space.

This iommu_get() fetches that unit behind the address space. Here,
the unit is defined as IOMMUObject, which is currently a (void *). In
the future, maybe we can give it a better definition, but imho it's
good enough for now, considering it's arch-dependent.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/exec/memory.h | 30 ++++++++++++++++++++++++++++++
 memory.c              |  8 ++++++++
 2 files changed, 38 insertions(+)
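
As a rough sketch of how an arch-specific address space might wire up
this hook (the Foo* names below are made up for illustration and are
not part of this patch):

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Hypothetical arch code -- all "Foo" names are made up */
typedef struct FooIOMMUState FooIOMMUState;

typedef struct FooAddressSpace {
    AddressSpace as;          /* the per-device address space */
    FooIOMMUState *iommu;     /* the translation unit behind "as" */
} FooAddressSpace;

static IOMMUObject *foo_as_iommu_get(AddressSpace *as)
{
    FooAddressSpace *fas = container_of(as, FooAddressSpace, as);

    /* IOMMUObject is (void *) for now, so any per-IOMMU handle works */
    return (IOMMUObject *)fas->iommu;
}

/* ...and, after address_space_init(&fas->as, root_mr, name):
 *     fas->as.as_ops.iommu_get = foo_as_iommu_get;
 */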

Comments

David Gibson May 1, 2017, 4:58 a.m. UTC | #1
On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> This is something similar to MemoryRegionOps, it's just for address
> spaces to store arch-specific hooks.
> 
> The first hook I would like to introduce is iommu_get().
> 
> For systems that have IOMMUs, we will create a special address space per
> device which is different from system default address space for
> it (please refer to pci_device_iommu_address_space()). Normally when
> that happens, there will be one specific IOMMU (or say, translation
> unit) stands right behind that new address space.
> 
> This iommu_get() fetches that guy behind the address space. Here, the
> guy is defined as IOMMUObject, which is currently a (void *). In the
> future, maybe we can make it a better definition, but imho it's good
> enough for now, considering it's arch-dependent.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>

This doesn't make sense to me.  It would be entirely possible for a
single address space to have different regions mapped by different
IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
a device or memory block.

> ---
>  include/exec/memory.h | 30 ++++++++++++++++++++++++++++++
>  memory.c              |  8 ++++++++
>  2 files changed, 38 insertions(+)
> 
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index e5707b3..0b0b58b 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -183,6 +183,15 @@ struct MemoryRegionOps {
>      const MemoryRegionMmio old_mmio;
>  };
>  
> +/*
> + * This stands for an IOMMU unit. Normally it should be exactly the
> + * IOMMU device, however this can also be actually anything which is
> + * related to that translation unit. What it is should be totally
> + * arch-dependent. Maybe one day we can have something better than a
> + * (void *) here.
> + */
> +typedef void *IOMMUObject;
> +
>  typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
>  
>  struct MemoryRegionIOMMUOps {
> @@ -282,6 +291,19 @@ struct MemoryListener {
>  };
>  
>  /**
> + * AddressSpaceOps: callbacks structure for address space specific operations
> + *
> + * @iommu_get: returns an IOMMU object that backs the address space.
> + *             Normally this should be NULL for generic address
> + *             spaces, and it's only used when there is one
> + *             translation unit behind this address space.
> + */
> +struct AddressSpaceOps {
> +    IOMMUObject *(*iommu_get)(AddressSpace *as);
> +};
> +typedef struct AddressSpaceOps AddressSpaceOps;
> +
> +/**
>   * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>   */
>  struct AddressSpace {
> @@ -302,6 +324,7 @@ struct AddressSpace {
>      MemoryListener dispatch_listener;
>      QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> +    AddressSpaceOps as_ops;
>  };
>  
>  /**
> @@ -1800,6 +1823,13 @@ address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
>      address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
>  }
>  
> +/**
> + * address_space_iommu_get: Get the backend IOMMU for the address space
> + *
> + * @as: the address space to fetch IOMMU from
> + */
> +IOMMUObject *address_space_iommu_get(AddressSpace *as);
> +
>  #endif
>  
>  #endif
> diff --git a/memory.c b/memory.c
> index 6af523e..6aaad45 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -2500,6 +2500,14 @@ void address_space_destroy(AddressSpace *as)
>      call_rcu(as, do_address_space_destroy, rcu);
>  }
>  
> +IOMMUObject *address_space_iommu_get(AddressSpace *as)
> +{
> +    if (!as->as_ops.iommu_get) {
> +        return NULL;
> +    }
> +    return as->as_ops.iommu_get(as);
> +}
> +
>  static const char *memory_region_type(MemoryRegion *mr)
>  {
>      if (memory_region_is_ram_device(mr)) {
Liu, Yi L May 7, 2017, 9:44 a.m. UTC | #2
On Mon, May 08, 2017 at 03:32:17PM +0800, Peter Xu wrote:
> On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> > On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > > This is something similar to MemoryRegionOps, it's just for address
> > > > > spaces to store arch-specific hooks.
> > > > > 
> > > > > The first hook I would like to introduce is iommu_get().
> > > > > 
> > > > > For systems that have IOMMUs, we will create a special address space per
> > > > > device which is different from system default address space for
> > > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > > that happens, there will be one specific IOMMU (or say, translation
> > > > > unit) stands right behind that new address space.
> > > > > 
> > > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > > future, maybe we can make it a better definition, but imho it's good
> > > > > enough for now, considering it's arch-dependent.
> > > > > 
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > 
> > > > This doesn't make sense to me.  It would be entirely possible for a
> > > > single address space to have different regions mapped by different
> > > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > > a device or memory block.
> > > 
> > > Oh, so it's more complicated than I thought... Then, do we really have
> > > existing use case that one device is managed by more than one IOMMU
> > > (on any of the platform)? Frankly speaking I haven't thought about
> > > complicated scenarios like this, or nested IOMMUs yet.
> > 
> > Sort of, it depends what you count as "more than one IOMMU".
> > 
> > spapr can - depending on guest configuration - have two IOMMU windows
> > for each guest PCI domain.  In theory the guest can set these up
> > however it wants, in practice there's usually a small (~256MiB) at PCI
> > address 0 for the benefit of 32-bit PCI devices, then a much larger
> > window up at a high address to allow better performance for 64-bit
> > capable devices.
> > 
> > Those are the same IOMMU in the sense that they're both implemented by
> > logic built into the same virtual PCI host bridge.  However, they're
> > different IOMMUs in the sense that they have independent data
> > structures describing the mappings and are currently modelled as two
> > different IOMMU memory regions.
> > 
> > 
> > I don't believe we have any existing platforms with both an IOMMU and
> > a direct mapped window in a device's address space.  But it seems to
> > be just too plausible a setup to not plan for it. [1]
> > 
> > > This patch derived from a requirement in virt-svm project (on x86).
> > > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > > notifiers are per-memory-region not per-iommu, and that's imho not
> > > what virt-svm wants. Any suggestions?
> > 
> > I don't know SVM, so I can't really make sense of that.  What format
> > does this identifier need?  What does "for one IOMMU" mean in this
> > context - i.e. what guest observable properties require the IDs to be
> > the same or to be different.
> 
> Virt-svm should need to trap the content of a register (actually the
> data is in the memory, but, let's assume it's a mmio operation for
> simplicity, considering it is finally delivered via invalidation
> requests), then pass that info down to kernel. So the listened element
> is per-iommu not per-mr this time. When the content changed, vfio will
> need to be notified, then pass this info down.
> 
> Yi/others, please feel free to correct me.

Hi David,

Maybe I can provide some details to assist your review.

virt-SVM wants two notifiers so far.
* notifier1: link the guest PASID table to the host; it would be
  triggered when a context entry is changed in the guest
* notifier2: pass down an IOMMU TLB invalidation; the mappings to be
  invalidated are GVA->GPA mappings. It just wants to flush the IOTLB
  in the host, no need to change the page table in memory with this
  notifier.

The two new notifiers must be registered when the vIOMMU is exposed to
the guest. A context entry flush triggers notifier1. IOMMU TLB flushing
(extended-IOTLB/pasid-cache/extended-device-IOTLB flushing in VT-d)
triggers notifier2.

The current MAP/UNMAP notifiers are actually registered when there is a
memory region event. In the latest code, the event is address space
switching. They are triggered when the guest changes IOVA->GPA mappings.

Both the existing notifiers and the new notifiers would be triggered by
IOMMU events (IOTLB flush) in the guest. However, the registration is
different. The new notifiers depend on vIOMMU exposure; they are
per-IOMMU-unit as Peter mentioned, while MAP/UNMAP registration depends
on an IOMMU memory region, so it's more like a memory region notifier.
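
Just to make the shape concrete, such a per-IOMMU notifier interface
could look roughly like the sketch below (hypothetical names, not from
any posted patch; the vIOMMU state would keep a list of these):

#include "qemu/osdep.h"
#include "qemu/queue.h"

/* Hypothetical sketch only */
typedef enum IOMMUEventType {
    IOMMU_EVENT_BIND_PASIDT,     /* notifier1: link guest PASID table */
    IOMMU_EVENT_TLB_INVALIDATE,  /* notifier2: pass down IOTLB flush  */
} IOMMUEventType;

typedef struct IOMMUEvent {
    IOMMUEventType type;
    void *data;                  /* arch-specific payload */
} IOMMUEvent;

typedef struct IOMMUObjectNotifier IOMMUObjectNotifier;
typedef void (*IOMMUObjectNotifyFn)(IOMMUObjectNotifier *n,
                                    IOMMUEvent *event);

struct IOMMUObjectNotifier {
    IOMMUObjectNotifyFn notify;
    QLIST_ENTRY(IOMMUObjectNotifier) node;  /* linked into vIOMMU state */
};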

Thanks,
Yi L
 
> Thanks,
> 
> > 
> > 
> > [1] My reasoning here is similar to the reason sPAPR allows the two
> > windows.  For PAPR, the guest is paravirtualized, so both windows
> > essentially have to be remapped IOMMU windows.  For a bare metal
> > platform it seems a very reasonable tradeoff would be to have a
> > small(ish) 32-bit IOMMU window to allow 32-bit devices to work on a
> > large RAM machine, along with a large direct mapped "bypass" window
> > for maxmimum performance for 64-bit devices.
> > 
> > -- 
> > David Gibson			| I'll have my music baroque, and my code
> > david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> > 				| _way_ _around_!
> > http://www.ozlabs.org/~dgibson
> 
> -- 
> Peter Xu
>
Peter Xu May 8, 2017, 5:48 a.m. UTC | #3
On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > This is something similar to MemoryRegionOps, it's just for address
> > spaces to store arch-specific hooks.
> > 
> > The first hook I would like to introduce is iommu_get().
> > 
> > For systems that have IOMMUs, we will create a special address space per
> > device which is different from system default address space for
> > it (please refer to pci_device_iommu_address_space()). Normally when
> > that happens, there will be one specific IOMMU (or say, translation
> > unit) stands right behind that new address space.
> > 
> > This iommu_get() fetches that guy behind the address space. Here, the
> > guy is defined as IOMMUObject, which is currently a (void *). In the
> > future, maybe we can make it a better definition, but imho it's good
> > enough for now, considering it's arch-dependent.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> This doesn't make sense to me.  It would be entirely possible for a
> single address space to have different regions mapped by different
> IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> a device or memory block.

Oh, so it's more complicated than I thought... Then, do we really have
an existing use case where one device is managed by more than one IOMMU
(on any platform)? Frankly speaking I haven't thought about
complicated scenarios like this, or nested IOMMUs yet.

This patch derived from a requirement in the virt-svm project (on x86).
Virt-svm needs some notification mechanism for each IOMMU (or say, the
IOMMU that manages the SVM-enabled device). For now, all IOMMU
notifiers are per-memory-region, not per-IOMMU, and that's imho not
what virt-svm wants. Any suggestions?

Thanks,
David Gibson May 8, 2017, 6:07 a.m. UTC | #4
On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > This is something similar to MemoryRegionOps, it's just for address
> > > spaces to store arch-specific hooks.
> > > 
> > > The first hook I would like to introduce is iommu_get().
> > > 
> > > For systems that have IOMMUs, we will create a special address space per
> > > device which is different from system default address space for
> > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > that happens, there will be one specific IOMMU (or say, translation
> > > unit) stands right behind that new address space.
> > > 
> > > This iommu_get() fetches that guy behind the address space. Here, the
> > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > future, maybe we can make it a better definition, but imho it's good
> > > enough for now, considering it's arch-dependent.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > 
> > This doesn't make sense to me.  It would be entirely possible for a
> > single address space to have different regions mapped by different
> > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > a device or memory block.
> 
> Oh, so it's more complicated than I thought... Then, do we really have
> existing use case that one device is managed by more than one IOMMU
> (on any of the platform)? Frankly speaking I haven't thought about
> complicated scenarios like this, or nested IOMMUs yet.

Sort of, it depends what you count as "more than one IOMMU".

spapr can - depending on guest configuration - have two IOMMU windows
for each guest PCI domain.  In theory the guest can set these up
however it wants; in practice there's usually a small (~256MiB) window
at PCI address 0 for the benefit of 32-bit PCI devices, then a much
larger window up at a high address to allow better performance for
64-bit capable devices.

Those are the same IOMMU in the sense that they're both implemented by
logic built into the same virtual PCI host bridge.  However, they're
different IOMMUs in the sense that they have independent data
structures describing the mappings and are currently modelled as two
different IOMMU memory regions.


I don't believe we have any existing platforms with both an IOMMU and
a direct mapped window in a device's address space.  But it seems to
be just too plausible a setup to not plan for it. [1]

> This patch derived from a requirement in virt-svm project (on x86).
> Virt-svm needs some notification mechanism for each IOMMU (or say, the
> IOMMU that managers the SVM-enabled device). For now, all IOMMU
> notifiers are per-memory-region not per-iommu, and that's imho not
> what virt-svm wants. Any suggestions?

I don't know SVM, so I can't really make sense of that.  What format
does this identifier need?  What does "for one IOMMU" mean in this
context - i.e. what guest observable properties require the IDs to be
the same or to be different.


[1] My reasoning here is similar to the reason sPAPR allows the two
windows.  For PAPR, the guest is paravirtualized, so both windows
essentially have to be remapped IOMMU windows.  For a bare metal
platform it seems a very reasonable tradeoff would be to have a
small(ish) 32-bit IOMMU window to allow 32-bit devices to work on a
large RAM machine, along with a large direct mapped "bypass" window
for maximum performance for 64-bit devices.
Peter Xu May 8, 2017, 7:32 a.m. UTC | #5
On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > This is something similar to MemoryRegionOps, it's just for address
> > > > spaces to store arch-specific hooks.
> > > > 
> > > > The first hook I would like to introduce is iommu_get().
> > > > 
> > > > For systems that have IOMMUs, we will create a special address space per
> > > > device which is different from system default address space for
> > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > that happens, there will be one specific IOMMU (or say, translation
> > > > unit) stands right behind that new address space.
> > > > 
> > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > future, maybe we can make it a better definition, but imho it's good
> > > > enough for now, considering it's arch-dependent.
> > > > 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > 
> > > This doesn't make sense to me.  It would be entirely possible for a
> > > single address space to have different regions mapped by different
> > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > a device or memory block.
> > 
> > Oh, so it's more complicated than I thought... Then, do we really have
> > existing use case that one device is managed by more than one IOMMU
> > (on any of the platform)? Frankly speaking I haven't thought about
> > complicated scenarios like this, or nested IOMMUs yet.
> 
> Sort of, it depends what you count as "more than one IOMMU".
> 
> spapr can - depending on guest configuration - have two IOMMU windows
> for each guest PCI domain.  In theory the guest can set these up
> however it wants, in practice there's usually a small (~256MiB) at PCI
> address 0 for the benefit of 32-bit PCI devices, then a much larger
> window up at a high address to allow better performance for 64-bit
> capable devices.
> 
> Those are the same IOMMU in the sense that they're both implemented by
> logic built into the same virtual PCI host bridge.  However, they're
> different IOMMUs in the sense that they have independent data
> structures describing the mappings and are currently modelled as two
> different IOMMU memory regions.
> 
> 
> I don't believe we have any existing platforms with both an IOMMU and
> a direct mapped window in a device's address space.  But it seems to
> be just too plausible a setup to not plan for it. [1]
> 
> > This patch derived from a requirement in virt-svm project (on x86).
> > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > notifiers are per-memory-region not per-iommu, and that's imho not
> > what virt-svm wants. Any suggestions?
> 
> I don't know SVM, so I can't really make sense of that.  What format
> does this identifier need?  What does "for one IOMMU" mean in this
> context - i.e. what guest observable properties require the IDs to be
> the same or to be different.

Virt-svm needs to trap the content of a register (actually the data is
in memory, but let's assume it's an MMIO operation for simplicity,
considering it is finally delivered via invalidation requests), then
pass that info down to the kernel. So the listened-to element is
per-IOMMU, not per-MR, this time. When the content changes, vfio will
need to be notified, then pass this info down.

Yi/others, please feel free to correct me.

Thanks,

> 
> 
> [1] My reasoning here is similar to the reason sPAPR allows the two
> windows.  For PAPR, the guest is paravirtualized, so both windows
> essentially have to be remapped IOMMU windows.  For a bare metal
> platform it seems a very reasonable tradeoff would be to have a
> small(ish) 32-bit IOMMU window to allow 32-bit devices to work on a
> large RAM machine, along with a large direct mapped "bypass" window
> for maxmimum performance for 64-bit devices.
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson May 10, 2017, 7:04 a.m. UTC | #6
On Mon, May 08, 2017 at 03:32:17PM +0800, Peter Xu wrote:
> On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> > On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > > This is something similar to MemoryRegionOps, it's just for address
> > > > > spaces to store arch-specific hooks.
> > > > > 
> > > > > The first hook I would like to introduce is iommu_get().
> > > > > 
> > > > > For systems that have IOMMUs, we will create a special address space per
> > > > > device which is different from system default address space for
> > > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > > that happens, there will be one specific IOMMU (or say, translation
> > > > > unit) stands right behind that new address space.
> > > > > 
> > > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > > future, maybe we can make it a better definition, but imho it's good
> > > > > enough for now, considering it's arch-dependent.
> > > > > 
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > 
> > > > This doesn't make sense to me.  It would be entirely possible for a
> > > > single address space to have different regions mapped by different
> > > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > > a device or memory block.
> > > 
> > > Oh, so it's more complicated than I thought... Then, do we really have
> > > existing use case that one device is managed by more than one IOMMU
> > > (on any of the platform)? Frankly speaking I haven't thought about
> > > complicated scenarios like this, or nested IOMMUs yet.
> > 
> > Sort of, it depends what you count as "more than one IOMMU".
> > 
> > spapr can - depending on guest configuration - have two IOMMU windows
> > for each guest PCI domain.  In theory the guest can set these up
> > however it wants, in practice there's usually a small (~256MiB) at PCI
> > address 0 for the benefit of 32-bit PCI devices, then a much larger
> > window up at a high address to allow better performance for 64-bit
> > capable devices.
> > 
> > Those are the same IOMMU in the sense that they're both implemented by
> > logic built into the same virtual PCI host bridge.  However, they're
> > different IOMMUs in the sense that they have independent data
> > structures describing the mappings and are currently modelled as two
> > different IOMMU memory regions.
> > 
> > 
> > I don't believe we have any existing platforms with both an IOMMU and
> > a direct mapped window in a device's address space.  But it seems to
> > be just too plausible a setup to not plan for it. [1]
> > 
> > > This patch derived from a requirement in virt-svm project (on x86).
> > > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > > notifiers are per-memory-region not per-iommu, and that's imho not
> > > what virt-svm wants. Any suggestions?
> > 
> > I don't know SVM, so I can't really make sense of that.  What format
> > does this identifier need?  What does "for one IOMMU" mean in this
> > context - i.e. what guest observable properties require the IDs to be
> > the same or to be different.
> 
> Virt-svm should need to trap the content of a register (actually the
> data is in the memory, but, let's assume it's a mmio operation for
> simplicity, considering it is finally delivered via invalidation
> requests), then pass that info down to kernel. So the listened element
> is per-iommu not per-mr this time. When the content changed, vfio will
> need to be notified, then pass this info down.

I don't entirely follow what you're saying.  When the virtual hardware
gets an invalidate request, it looks up the unit to invalidate in
memory?  Which component gets to decide that ID?  How is it advertised
to the guest OS?

If your ID is tied to the AS now, you could just iterate through the
AS and invalidate any IOMMU MRs that are present within it.

Alternatively, if the ID is tied to something more concrete, like a
specific PCI host bridge (which incorporates the IOMMU logic), then
that device probably already has a handle on the right IOMMU MR to
invalidate it.


> 
> Yi/others, please feel free to correct me.
> 
> Thanks,
> 
> > 
> > 
> > [1] My reasoning here is similar to the reason sPAPR allows the two
> > windows.  For PAPR, the guest is paravirtualized, so both windows
> > essentially have to be remapped IOMMU windows.  For a bare metal
> > platform it seems a very reasonable tradeoff would be to have a
> > small(ish) 32-bit IOMMU window to allow 32-bit devices to work on a
> > large RAM machine, along with a large direct mapped "bypass" window
> > for maxmimum performance for 64-bit devices.
> > 
>
Peter Xu May 11, 2017, 5:04 a.m. UTC | #7
On Wed, May 10, 2017 at 05:04:06PM +1000, David Gibson wrote:
> On Mon, May 08, 2017 at 03:32:17PM +0800, Peter Xu wrote:
> > On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> > > On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > > > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > > > This is something similar to MemoryRegionOps, it's just for address
> > > > > > spaces to store arch-specific hooks.
> > > > > > 
> > > > > > The first hook I would like to introduce is iommu_get().
> > > > > > 
> > > > > > For systems that have IOMMUs, we will create a special address space per
> > > > > > device which is different from system default address space for
> > > > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > > > that happens, there will be one specific IOMMU (or say, translation
> > > > > > unit) stands right behind that new address space.
> > > > > > 
> > > > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > > > future, maybe we can make it a better definition, but imho it's good
> > > > > > enough for now, considering it's arch-dependent.
> > > > > > 
> > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > 
> > > > > This doesn't make sense to me.  It would be entirely possible for a
> > > > > single address space to have different regions mapped by different
> > > > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > > > a device or memory block.
> > > > 
> > > > Oh, so it's more complicated than I thought... Then, do we really have
> > > > existing use case that one device is managed by more than one IOMMU
> > > > (on any of the platform)? Frankly speaking I haven't thought about
> > > > complicated scenarios like this, or nested IOMMUs yet.
> > > 
> > > Sort of, it depends what you count as "more than one IOMMU".
> > > 
> > > spapr can - depending on guest configuration - have two IOMMU windows
> > > for each guest PCI domain.  In theory the guest can set these up
> > > however it wants, in practice there's usually a small (~256MiB) at PCI
> > > address 0 for the benefit of 32-bit PCI devices, then a much larger
> > > window up at a high address to allow better performance for 64-bit
> > > capable devices.
> > > 
> > > Those are the same IOMMU in the sense that they're both implemented by
> > > logic built into the same virtual PCI host bridge.  However, they're
> > > different IOMMUs in the sense that they have independent data
> > > structures describing the mappings and are currently modelled as two
> > > different IOMMU memory regions.
> > > 
> > > 
> > > I don't believe we have any existing platforms with both an IOMMU and
> > > a direct mapped window in a device's address space.  But it seems to
> > > be just too plausible a setup to not plan for it. [1]
> > > 
> > > > This patch derived from a requirement in virt-svm project (on x86).
> > > > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > > > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > > > notifiers are per-memory-region not per-iommu, and that's imho not
> > > > what virt-svm wants. Any suggestions?
> > > 
> > > I don't know SVM, so I can't really make sense of that.  What format
> > > does this identifier need?  What does "for one IOMMU" mean in this
> > > context - i.e. what guest observable properties require the IDs to be
> > > the same or to be different.
> > 
> > Virt-svm should need to trap the content of a register (actually the
> > data is in the memory, but, let's assume it's a mmio operation for
> > simplicity, considering it is finally delivered via invalidation
> > requests), then pass that info down to kernel. So the listened element
> > is per-iommu not per-mr this time. When the content changed, vfio will
> > need to be notified, then pass this info down.
> 
> I don't entirely follow what you're saying.  When the virtual hardware
> gets an invalidate request, it looks up the unit to invalidate in
> memory?  Which component gets to decide that ID?  How is it advertised
> to the guest OS?
> 
> If your ID is tied to the AS now, you could just iterate through the
> AS and invalidate any IOMMU MRs that are present within it.
> 
> Alternatively, if the ID is tied to something more concrete, like a
> specific PCI host bridge (which incorporates the IOMMU logic), then
> that device probably already has a handle on the right IOMMU MR to
> invalidate it.

Sorry to be unclear on the requirement. I don't know what the ID you
mentioned above is... Anyway, let me try to further simplify the use
case.

Just assume we have such a requirement: when one register of the
vIOMMU changes, we need to pass this register data to the hardware
IOMMU in some way. And, let's assume this is a notification mechanism,
so that every device in the system can listen to this register change,
then capture what has changed to what. The point here is that, in all
cases, this event is not related to a memory region at all. So imho we
need some other way to do it besides the memory region IOMMU notifiers.
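
As a purely illustrative sketch of that flow, reusing the hypothetical
IOMMUObjectNotifier/IOMMUEvent types sketched earlier in this thread
(none of these names exist in QEMU):

/* Hypothetical vIOMMU state fragment and firing path */
typedef struct FooIOMMUState {
    QLIST_HEAD(, IOMMUObjectNotifier) notifiers;
    /* ...registers, caches, ... */
} FooIOMMUState;

static void foo_iommu_fire(FooIOMMUState *s, IOMMUEventType type,
                           void *data)
{
    IOMMUEvent ev = { .type = type, .data = data };
    IOMMUObjectNotifier *n;

    /* No memory region involved: listeners (e.g. vfio) hang off the
     * translation unit itself. */
    QLIST_FOREACH(n, &s->notifiers, node) {
        n->notify(n, &ev);
    }
}

/* e.g. called from the vIOMMU's register-write/invalidation handling:
 *     foo_iommu_fire(s, IOMMU_EVENT_TLB_INVALIDATE, &desc);
 */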

Thanks,
David Gibson May 15, 2017, 5:32 a.m. UTC | #8
On Thu, May 11, 2017 at 01:04:26PM +0800, Peter Xu wrote:
> On Wed, May 10, 2017 at 05:04:06PM +1000, David Gibson wrote:
> > On Mon, May 08, 2017 at 03:32:17PM +0800, Peter Xu wrote:
> > > On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> > > > On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > > > > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > > > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > > > > This is something similar to MemoryRegionOps, it's just for address
> > > > > > > spaces to store arch-specific hooks.
> > > > > > > 
> > > > > > > The first hook I would like to introduce is iommu_get().
> > > > > > > 
> > > > > > > For systems that have IOMMUs, we will create a special address space per
> > > > > > > device which is different from system default address space for
> > > > > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > > > > that happens, there will be one specific IOMMU (or say, translation
> > > > > > > unit) stands right behind that new address space.
> > > > > > > 
> > > > > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > > > > future, maybe we can make it a better definition, but imho it's good
> > > > > > > enough for now, considering it's arch-dependent.
> > > > > > > 
> > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > 
> > > > > > This doesn't make sense to me.  It would be entirely possible for a
> > > > > > single address space to have different regions mapped by different
> > > > > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > > > > a device or memory block.
> > > > > 
> > > > > Oh, so it's more complicated than I thought... Then, do we really have
> > > > > existing use case that one device is managed by more than one IOMMU
> > > > > (on any of the platform)? Frankly speaking I haven't thought about
> > > > > complicated scenarios like this, or nested IOMMUs yet.
> > > > 
> > > > Sort of, it depends what you count as "more than one IOMMU".
> > > > 
> > > > spapr can - depending on guest configuration - have two IOMMU windows
> > > > for each guest PCI domain.  In theory the guest can set these up
> > > > however it wants, in practice there's usually a small (~256MiB) at PCI
> > > > address 0 for the benefit of 32-bit PCI devices, then a much larger
> > > > window up at a high address to allow better performance for 64-bit
> > > > capable devices.
> > > > 
> > > > Those are the same IOMMU in the sense that they're both implemented by
> > > > logic built into the same virtual PCI host bridge.  However, they're
> > > > different IOMMUs in the sense that they have independent data
> > > > structures describing the mappings and are currently modelled as two
> > > > different IOMMU memory regions.
> > > > 
> > > > 
> > > > I don't believe we have any existing platforms with both an IOMMU and
> > > > a direct mapped window in a device's address space.  But it seems to
> > > > be just too plausible a setup to not plan for it. [1]
> > > > 
> > > > > This patch derived from a requirement in virt-svm project (on x86).
> > > > > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > > > > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > > > > notifiers are per-memory-region not per-iommu, and that's imho not
> > > > > what virt-svm wants. Any suggestions?
> > > > 
> > > > I don't know SVM, so I can't really make sense of that.  What format
> > > > does this identifier need?  What does "for one IOMMU" mean in this
> > > > context - i.e. what guest observable properties require the IDs to be
> > > > the same or to be different.
> > > 
> > > Virt-svm should need to trap the content of a register (actually the
> > > data is in the memory, but, let's assume it's a mmio operation for
> > > simplicity, considering it is finally delivered via invalidation
> > > requests), then pass that info down to kernel. So the listened element
> > > is per-iommu not per-mr this time. When the content changed, vfio will
> > > need to be notified, then pass this info down.
> > 
> > I don't entirely follow what you're saying.  When the virtual hardware
> > gets an invalidate request, it looks up the unit to invalidate in
> > memory?  Which component gets to decide that ID?  How is it advertised
> > to the guest OS?
> > 
> > If your ID is tied to the AS now, you could just iterate through the
> > AS and invalidate any IOMMU MRs that are present within it.
> > 
> > Alternatively, if the ID is tied to something more concrete, like a
> > specific PCI host bridge (which incorporates the IOMMU logic), then
> > that device probably already has a handle on the right IOMMU MR to
> > invalidate it.
> 
> Sorry to be unclear on the requirement. I don't know what's the ID you
> mentioned above... Anyway, let me try to further simplify the use
> case.

Right, the ID was me guessing badly at what's going on here, so I
think it confused rather than clarified.

> Just assume we have such a requirement: when one register of vIOMMU
> changes, we need to pass this register data to the hardware IOMMU by
> some way. And, let's assume this is a notification mechanism, so that
> every device in the system can listen to this register change, then
> capture what has changed to what. Here the point is, in all cases this
> event is not related to memory region at all. So imho we need some
> other way to do it besides memory region IOMMU notifiers.

Ok.  So is this right?
	* You have a single bank of vIOMMU registers
	* Which control two (or more) IOMMU regions in the guest's
          address space
	* Assuming the host also has an AMD IOMMU, those will be
          backed by a single IOMMU on the host ("single" meaning
          controlled by a single bank of host registers)

I'm assuming the guest IOMMU code must know which IOMMU regions it is
managing, so getting from the guest registers to the set of IOMMU MRs
should be easy.

What's the operation that needs to happen on the host IOMMU, in terms
of the VFIO IOMMU interface?

Is this inherently only possible if both host and guest have an AMD
IOMMU?  Could it be made to work if the guest had an AMD IOMMU but the
host had an Intel one, or the other way around?

Would it make sense to have a single IOMMU MR in the guest, but
instead of mapping it whole into the guest address space, have two
(or more) alias MRs in the AS which each allow access to a portion of
the IOMMU MR?
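
For reference, that alias arrangement is roughly what
memory_region_init_alias() provides; a minimal sketch with made-up
window sizes and addresses:

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Sketch only: expose two windows of one IOMMU MR into an AS root */
static void map_iommu_windows(MemoryRegion *as_root, Object *owner,
                              MemoryRegion *iommu_mr)
{
    /* statics just to keep the sketch short */
    static MemoryRegion win32, win64;

    /* small window at PCI address 0 for 32-bit devices (256MiB) */
    memory_region_init_alias(&win32, owner, "iommu-win32",
                             iommu_mr, 0, 0x10000000ULL);
    memory_region_add_subregion(as_root, 0, &win32);

    /* large window up high for 64-bit capable devices (64GiB),
     * backed by another part of the same IOMMU MR */
    memory_region_init_alias(&win64, owner, "iommu-win64",
                             iommu_mr, 0x10000000ULL, 0x1000000000ULL);
    memory_region_add_subregion(as_root, 0x800000000ULL, &win64);
}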
Peter Xu May 25, 2017, 7:24 a.m. UTC | #9
On Mon, May 15, 2017 at 03:32:11PM +1000, David Gibson wrote:
> On Thu, May 11, 2017 at 01:04:26PM +0800, Peter Xu wrote:
> > On Wed, May 10, 2017 at 05:04:06PM +1000, David Gibson wrote:
> > > On Mon, May 08, 2017 at 03:32:17PM +0800, Peter Xu wrote:
> > > > On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> > > > > On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > > > > > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > > > > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > > > > > This is something similar to MemoryRegionOps, it's just for address
> > > > > > > > spaces to store arch-specific hooks.
> > > > > > > > 
> > > > > > > > The first hook I would like to introduce is iommu_get().
> > > > > > > > 
> > > > > > > > For systems that have IOMMUs, we will create a special address space per
> > > > > > > > device which is different from system default address space for
> > > > > > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > > > > > that happens, there will be one specific IOMMU (or say, translation
> > > > > > > > unit) stands right behind that new address space.
> > > > > > > > 
> > > > > > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > > > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > > > > > future, maybe we can make it a better definition, but imho it's good
> > > > > > > > enough for now, considering it's arch-dependent.
> > > > > > > > 
> > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > > 
> > > > > > > This doesn't make sense to me.  It would be entirely possible for a
> > > > > > > single address space to have different regions mapped by different
> > > > > > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > > > > > a device or memory block.
> > > > > > 
> > > > > > Oh, so it's more complicated than I thought... Then, do we really have
> > > > > > existing use case that one device is managed by more than one IOMMU
> > > > > > (on any of the platform)? Frankly speaking I haven't thought about
> > > > > > complicated scenarios like this, or nested IOMMUs yet.
> > > > > 
> > > > > Sort of, it depends what you count as "more than one IOMMU".
> > > > > 
> > > > > spapr can - depending on guest configuration - have two IOMMU windows
> > > > > for each guest PCI domain.  In theory the guest can set these up
> > > > > however it wants, in practice there's usually a small (~256MiB) at PCI
> > > > > address 0 for the benefit of 32-bit PCI devices, then a much larger
> > > > > window up at a high address to allow better performance for 64-bit
> > > > > capable devices.
> > > > > 
> > > > > Those are the same IOMMU in the sense that they're both implemented by
> > > > > logic built into the same virtual PCI host bridge.  However, they're
> > > > > different IOMMUs in the sense that they have independent data
> > > > > structures describing the mappings and are currently modelled as two
> > > > > different IOMMU memory regions.

[1]

> > > > > 
> > > > > 
> > > > > I don't believe we have any existing platforms with both an IOMMU and
> > > > > a direct mapped window in a device's address space.  But it seems to
> > > > > be just too plausible a setup to not plan for it. [1]
> > > > > 
> > > > > > This patch derived from a requirement in virt-svm project (on x86).
> > > > > > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > > > > > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > > > > > notifiers are per-memory-region not per-iommu, and that's imho not
> > > > > > what virt-svm wants. Any suggestions?
> > > > > 
> > > > > I don't know SVM, so I can't really make sense of that.  What format
> > > > > does this identifier need?  What does "for one IOMMU" mean in this
> > > > > context - i.e. what guest observable properties require the IDs to be
> > > > > the same or to be different.
> > > > 
> > > > Virt-svm should need to trap the content of a register (actually the
> > > > data is in the memory, but, let's assume it's a mmio operation for
> > > > simplicity, considering it is finally delivered via invalidation
> > > > requests), then pass that info down to kernel. So the listened element
> > > > is per-iommu not per-mr this time. When the content changed, vfio will
> > > > need to be notified, then pass this info down.
> > > 
> > > I don't entirely follow what you're saying.  When the virtual hardware
> > > gets an invalidate request, it looks up the unit to invalidate in
> > > memory?  Which component gets to decide that ID?  How is it advertised
> > > to the guest OS?
> > > 
> > > If your ID is tied to the AS now, you could just iterate through the
> > > AS and invalidate any IOMMU MRs that are present within it.
> > > 
> > > Alternatively, if the ID is tied to something more concrete, like a
> > > specific PCI host bridge (which incorporates the IOMMU logic), then
> > > that device probably already has a handle on the right IOMMU MR to
> > > invalidate it.
> > 
> > Sorry to be unclear on the requirement. I don't know what's the ID you
> > mentioned above... Anyway, let me try to further simplify the use
> > case.
> 
> Right, the ID was me guessing badly at what's going on here, so I
> think it confused rather than clarifying.
> 
> > Just assume we have such a requirement: when one register of vIOMMU
> > changes, we need to pass this register data to the hardware IOMMU by
> > some way. And, let's assume this is a notification mechanism, so that
> > every device in the system can listen to this register change, then
> > capture what has changed to what. Here the point is, in all cases this
> > event is not related to memory region at all. So imho we need some
> > other way to do it besides memory region IOMMU notifiers.
> 
> Ok.  So is this right?
> 	* You have a single bank of vIOMMU registers
> 	* Which control two (or more) IOMMU regions in in the guest's
>           address space
> 	* Assuming the host also has an AMD IOMMU, those will be
>           backed by a single IOMMU on the host ("single" meaning
>           controlled by a single bank of host registers)
> 
> I'm assuming the guest IOMMU code must know which IOMMU regions it is
> managing, so getting from the guest registers to the set of IOMMU MRs
> should be easy.
> 
> What's the operation that needs to happen on the host IOMMU, in terms
> of the VFIO IOMMU interface?

(Sorry to respond so late...)

It'll pass the captured data down to the host IOMMU in some way.

IMHO if we are discussing the notifier thing only, we don't really
need to know what it would do after it gets the data. The point is how
we should define this kind of notifier, which differs from the current
memory-region-based notifiers.

> 
> Is this inherently only possible if both host and guest have an AMD
> IOMMU?  Could it be made to work if the guest had an AMD IOMMU but the
> host had an Intel one, or the other way around?
> 
> Would it make sense to have a single IOMMU MR in the guest, but
> instead of mapping it whole into the guest address space, have two
> (or more) alias MRs in the AS which each allow access to a portion of
> the IOMMU MR?

For these questions, again I don't know whether they'll affect how we
design a notifier mechanism for the remapping unit... Would they
really? Or maybe I missed something?

Now that I know the sPAPR case you have explained [1] (thanks btw!),
could I say that these multiple IOMMU windows are still backed by some
unified hardware in the PCI host bridge? Can that be the so-called
single "IOMMU" object behind that device? And, would it be possible
that we might have a similar requirement in the future, just like what
Yi has met with virt-svm? (I don't know whether Power would support
SVM or similar, but I guess ARM should support it?)

I am just thinking about how we can define a better, more general (for
all platforms) IOMMU model in QEMU that can best suit our needs. And
currently that should be a model that can satisfy Yi's requirement.

Thanks,
David Gibson May 26, 2017, 5:30 a.m. UTC | #10
On Thu, May 25, 2017 at 03:24:30PM +0800, Peter Xu wrote:
> On Mon, May 15, 2017 at 03:32:11PM +1000, David Gibson wrote:
> > On Thu, May 11, 2017 at 01:04:26PM +0800, Peter Xu wrote:
> > > On Wed, May 10, 2017 at 05:04:06PM +1000, David Gibson wrote:
> > > > On Mon, May 08, 2017 at 03:32:17PM +0800, Peter Xu wrote:
> > > > > On Mon, May 08, 2017 at 04:07:44PM +1000, David Gibson wrote:
> > > > > > On Mon, May 08, 2017 at 01:48:14PM +0800, Peter Xu wrote:
> > > > > > > On Mon, May 01, 2017 at 02:58:22PM +1000, David Gibson wrote:
> > > > > > > > On Thu, Apr 27, 2017 at 05:34:18PM +0800, Peter Xu wrote:
> > > > > > > > > This is something similar to MemoryRegionOps, it's just for address
> > > > > > > > > spaces to store arch-specific hooks.
> > > > > > > > > 
> > > > > > > > > The first hook I would like to introduce is iommu_get().
> > > > > > > > > 
> > > > > > > > > For systems that have IOMMUs, we will create a special address space per
> > > > > > > > > device which is different from system default address space for
> > > > > > > > > it (please refer to pci_device_iommu_address_space()). Normally when
> > > > > > > > > that happens, there will be one specific IOMMU (or say, translation
> > > > > > > > > unit) stands right behind that new address space.
> > > > > > > > > 
> > > > > > > > > This iommu_get() fetches that guy behind the address space. Here, the
> > > > > > > > > guy is defined as IOMMUObject, which is currently a (void *). In the
> > > > > > > > > future, maybe we can make it a better definition, but imho it's good
> > > > > > > > > enough for now, considering it's arch-dependent.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > > > 
> > > > > > > > This doesn't make sense to me.  It would be entirely possible for a
> > > > > > > > single address space to have different regions mapped by different
> > > > > > > > IOMMUs.  Or some regions mapped by IOMMUs and others direct mapped to
> > > > > > > > a device or memory block.
> > > > > > > 
> > > > > > > Oh, so it's more complicated than I thought... Then, do we really have
> > > > > > > existing use case that one device is managed by more than one IOMMU
> > > > > > > (on any of the platform)? Frankly speaking I haven't thought about
> > > > > > > complicated scenarios like this, or nested IOMMUs yet.
> > > > > > 
> > > > > > Sort of, it depends what you count as "more than one IOMMU".
> > > > > > 
> > > > > > spapr can - depending on guest configuration - have two IOMMU windows
> > > > > > for each guest PCI domain.  In theory the guest can set these up
> > > > > > however it wants, in practice there's usually a small (~256MiB) at PCI
> > > > > > address 0 for the benefit of 32-bit PCI devices, then a much larger
> > > > > > window up at a high address to allow better performance for 64-bit
> > > > > > capable devices.
> > > > > > 
> > > > > > Those are the same IOMMU in the sense that they're both implemented by
> > > > > > logic built into the same virtual PCI host bridge.  However, they're
> > > > > > different IOMMUs in the sense that they have independent data
> > > > > > structures describing the mappings and are currently modelled as two
> > > > > > different IOMMU memory regions.
> 
> [1]
> 
> > > > > > 
> > > > > > 
> > > > > > I don't believe we have any existing platforms with both an IOMMU and
> > > > > > a direct mapped window in a device's address space.  But it seems to
> > > > > > be just too plausible a setup to not plan for it. [1]
> > > > > > 
> > > > > > > This patch derived from a requirement in virt-svm project (on x86).
> > > > > > > Virt-svm needs some notification mechanism for each IOMMU (or say, the
> > > > > > > IOMMU that managers the SVM-enabled device). For now, all IOMMU
> > > > > > > notifiers are per-memory-region not per-iommu, and that's imho not
> > > > > > > what virt-svm wants. Any suggestions?
> > > > > > 
> > > > > > I don't know SVM, so I can't really make sense of that.  What format
> > > > > > does this identifier need?  What does "for one IOMMU" mean in this
> > > > > > context - i.e. what guest observable properties require the IDs to be
> > > > > > the same or to be different.
> > > > > 
> > > > > Virt-svm should need to trap the content of a register (actually the
> > > > > data is in the memory, but, let's assume it's a mmio operation for
> > > > > simplicity, considering it is finally delivered via invalidation
> > > > > requests), then pass that info down to kernel. So the listened element
> > > > > is per-iommu not per-mr this time. When the content changed, vfio will
> > > > > need to be notified, then pass this info down.
> > > > 
> > > > I don't entirely follow what you're saying.  When the virtual hardware
> > > > gets an invalidate request, it looks up the unit to invalidate in
> > > > memory?  Which component gets to decide that ID?  How is it advertised
> > > > to the guest OS?
> > > > 
> > > > If your ID is tied to the AS now, you could just iterate through the
> > > > AS and invalidate any IOMMU MRs that are present within it.
> > > > 
> > > > Alternatively, if the ID is tied to something more concrete, like a
> > > > specific PCI host bridge (which incorporates the IOMMU logic), then
> > > > that device probably already has a handle on the right IOMMU MR to
> > > > invalidate it.
> > > 
> > > Sorry to be unclear on the requirement. I don't know what's the ID you
> > > mentioned above... Anyway, let me try to further simplify the use
> > > case.
> > 
> > Right, the ID was me guessing badly at what's going on here, so I
> > think it confused rather than clarifying.
> > 
> > > Just assume we have such a requirement: when one register of vIOMMU
> > > changes, we need to pass this register data to the hardware IOMMU by
> > > some way. And, let's assume this is a notification mechanism, so that
> > > every device in the system can listen to this register change, then
> > > capture what has changed to what. Here the point is, in all cases this
> > > event is not related to memory region at all. So imho we need some
> > > other way to do it besides memory region IOMMU notifiers.
> > 
> > Ok.  So is this right?
> > 	* You have a single bank of vIOMMU registers
> > 	* Which control two (or more) IOMMU regions in in the guest's
> >           address space
> > 	* Assuming the host also has an AMD IOMMU, those will be
> >           backed by a single IOMMU on the host ("single" meaning
> >           controlled by a single bank of host registers)
> > 
> > I'm assuming the guest IOMMU code must know which IOMMU regions it is
> > managing, so getting from the guest registers to the set of IOMMU MRs
> > should be easy.
> > 
> > What's the operation that needs to happen on the host IOMMU, in terms
> > of the VFIO IOMMU interface?
> 
> (Sorry to respond so late...)
> 
> It'll pass the captured data downward to host IOMMU in some way.
> 
> IMHO if we are discussing the notifier thing only, we don't really
> need to know what would it do after it gets the data. The point is how
> we should define this kind if notifies, which differs from current
> memory region based notifiers.

I'm trying to understand how it differs - I still don't have a clear
picture.  That's why I'm asking what needs to be passed to the host
MMU, so I can see why you need this different notifier.

> > Is this inherently only possible if both host and guest have an AMD
> > IOMMU?  Could it be made to work if the guest had an AMD IOMMU but the
> > host had an Intel one, or the other way around?
> > 
> > Would it make sense to have a single IOMMU MR in the guest, but
> > instead of mapping it whole into the guest address space, have two
> > (or more) alias MRs in the AS which each allow access to a portion of
> > the IOMMU MR?
> 
> For these questions, again I don't know whether it'll affect how we
> design a notifier mechanism to the remapping unit... Would it really?
> Or maybe I missed anything?
> 
> Till now, after I know the case for SPAPR you have explained [1]
> (thanks btw!), could I say that these multiple IOMMU windows still be
> backed by some unified hardware in the PCI host bridge? Can that be
> the so-called single "IOMMU" object behind that device? And, would it
> possible that we might have similar requirement in the future just
> like what Yi has met with virt-svm? (I don't know whether Power would
> support SVM or similar, but I guess ARM should support it?)

Well, yes, as I've said the two IOMMU windows in sPAPR are the same
IOMMU in the sense that they're implemented by basically the same
logic in the host bridge.

But what constitutes one or multiple IOMMUs all depends on your
definitions, and I'm still not understanding what about your structure
impacts the notifier design.

> I am just thinking how we can define a better and general (for all
> platforms) IOMMU model in QEMU that can best suite our needs. And
> currently that should be a model that can satisfy Yi's requirement.

I'm still trying to wrap my head around what those requirements are.
Yi Liu June 8, 2017, 8:24 a.m. UTC | #11
Hi David,

I added some updates on your questions. See if they make any sense.

> -----Original Message-----
> From: David Gibson [mailto:david@gibson.dropbear.id.au]
> Sent: Friday, May 26, 2017 1:30 PM
> To: Peter Xu <peterx@redhat.com>
> On Thu, May 25, 2017 at 03:24:30PM +0800, Peter Xu wrote:

[...]

> > > > > > >
> > > > > > > I don't believe we have any existing platforms with both an
> > > > > > > IOMMU and a direct mapped window in a device's address
> > > > > > > space.  But it seems to be just too plausible a setup to not
> > > > > > > plan for it. [1]
> > > > > > >
> > > > > > > > This patch derived from a requirement in virt-svm project (on x86).
> > > > > > > > Virt-svm needs some notification mechanism for each IOMMU
> > > > > > > > (or say, the IOMMU that managers the SVM-enabled device).
> > > > > > > > For now, all IOMMU notifiers are per-memory-region not
> > > > > > > > per-iommu, and that's imho not what virt-svm wants. Any suggestions?
> > > > > > >
> > > > > > > I don't know SVM, so I can't really make sense of that.
> > > > > > > What format does this identifier need?  What does "for one
> > > > > > > IOMMU" mean in this context - i.e. what guest observable
> > > > > > > properties require the IDs to be the same or to be different.
> > > > > >
> > > > > > Virt-svm should need to trap the content of a register
> > > > > > (actually the data is in the memory, but, let's assume it's a
> > > > > > mmio operation for simplicity, considering it is finally
> > > > > > delivered via invalidation requests), then pass that info down
> > > > > > to kernel. So the listened element is per-iommu not per-mr
> > > > > > this time. When the content changed, vfio will need to be notified, then
> pass this info down.
> > > > >
> > > > > I don't entirely follow what you're saying.  When the virtual
> > > > > hardware gets an invalidate request, it looks up the unit to
> > > > > invalidate in memory?  Which component gets to decide that ID?
> > > > > How is it advertised to the guest OS?
> > > > >
> > > > > If your ID is tied to the AS now, you could just iterate through
> > > > > the AS and invalidate any IOMMU MRs that are present within it.
> > > > >
> > > > > Alternatively, if the ID is tied to something more concrete,
> > > > > like a specific PCI host bridge (which incorporates the IOMMU
> > > > > logic), then that device probably already has a handle on the
> > > > > right IOMMU MR to invalidate it.
> > > >
> > > > Sorry to be unclear on the requirement. I don't know what's the ID
> > > > you mentioned above... Anyway, let me try to further simplify the
> > > > use case.
> > >
> > > Right, the ID was me guessing badly at what's going on here, so I
> > > think it confused rather than clarifying.
> > >
> > > > Just assume we have such a requirement: when one register of
> > > > vIOMMU changes, we need to pass this register data to the hardware
> > > > IOMMU by some way. And, let's assume this is a notification
> > > > mechanism, so that every device in the system can listen to this
> > > > register change, then capture what has changed to what. Here the
> > > > point is, in all cases this event is not related to memory region
> > > > at all. So imho we need some other way to do it besides memory region
> IOMMU notifiers.
> > >
> > > Ok.  So is this right?
> > > 	* You have a single bank of vIOMMU registers
> > > 	* Which control two (or more) IOMMU regions in in the guest's
> > >           address space
> > > 	* Assuming the host also has an AMD IOMMU, those will be
> > >           backed by a single IOMMU on the host ("single" meaning
> > >           controlled by a single bank of host registers)
> > >
> > > I'm assuming the guest IOMMU code must know which IOMMU regions it
> > > is managing, so getting from the guest registers to the set of IOMMU
> > > MRs should be easy.
> > >
> > > What's the operation that needs to happen on the host IOMMU, in
> > > terms of the VFIO IOMMU interface?
> >
> > (Sorry to respond so late...)
> >
> > It'll pass the captured data downward to host IOMMU in some way.
> >
> > IMHO if we are discussing the notifier thing only, we don't really
> > need to know what would it do after it gets the data. The point is how
> > we should define this kind if notifies, which differs from current
> > memory region based notifiers.
> 
> I'm trying to understand how it differs - I still don't have a clear picture.  That's why
> I'm asking what needs to be passed to the host MMU, so I can see why you need this
> different notifier.

It differs in the registration. The MemoryRegion-based notifiers
(MAP/UNMAP) are registered when an IOMMU MemoryRegion is used. On VT-d,
that is the time we switch the system memory address space to the VT-d
address space. This is introduced by the patch below:

[Qemu-devel] [PATCH v9 8/9] intel_iommu: allow dynamic switch of IOMMU region

For virt-SVM, we want to add two more notifiers. If we add the notifiers
at the same time as the MAP/UNMAP notifier, then virt-SVM cannot work if
there is no IOMMU region switch. This scenario happens when passthru mode
is used in the vIOMMU.

So in order to solve the scenario above, I discussed with Peter. My
initial plan was to move the registration of the new notifiers into
vfio_realize(). The logic is: if a vIOMMU is exposed to the guest, then
the new notifiers would be registered. This approach is proposed in my
RFC patch for virt-SVM.

https://www.spinics.net/lists/kvm/msg148803.html

However, Peter mentioned the approach in the RFC patch may not be
compatible with sPAPR since there may be multiple MemoryRegions for
sPAPR. It would result in multiple notifier registrations on sPAPR.

Due to that, Peter is trying to introduce the IOMMUObject. It would help
vfio_realize() detect whether a vIOMMU is exposed to the guest. It also
ties the IOMMU-specific notifiers to the exposure of the vIOMMU instead
of to IOMMU region switching.
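
A minimal sketch of that check on the vfio side, assuming the API from
this patch (vfio_register_iommu_notifiers() is a made-up placeholder):

#include "qemu/osdep.h"
#include "exec/memory.h"
#include "hw/pci/pci.h"

/* made-up placeholder for whatever does the actual registration */
void vfio_register_iommu_notifiers(PCIDevice *pdev, IOMMUObject *iommu);

/* Hypothetical helper called from vfio_realize() */
static void vfio_maybe_register_iommu_notifiers(PCIDevice *pdev)
{
    AddressSpace *as = pci_device_iommu_address_space(pdev);
    IOMMUObject *iommu = address_space_iommu_get(as);

    if (!iommu) {
        /* no vIOMMU exposed behind this device: nothing to register */
        return;
    }
    /* register the bind-pasid-table / tlb-invalidate notifiers here,
     * tied to the IOMMU unit rather than to an IOMMU memory region */
    vfio_register_iommu_notifiers(pdev, iommu);
}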

Any further comments, pls let me know.

Thanks,
Yi L

Patch

diff --git a/include/exec/memory.h b/include/exec/memory.h
index e5707b3..0b0b58b 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -183,6 +183,15 @@  struct MemoryRegionOps {
     const MemoryRegionMmio old_mmio;
 };
 
+/*
+ * This stands for an IOMMU unit. Normally it should be exactly the
+ * IOMMU device, however this can also be actually anything which is
+ * related to that translation unit. What it is should be totally
+ * arch-dependent. Maybe one day we can have something better than a
+ * (void *) here.
+ */
+typedef void *IOMMUObject;
+
 typedef struct MemoryRegionIOMMUOps MemoryRegionIOMMUOps;
 
 struct MemoryRegionIOMMUOps {
@@ -282,6 +291,19 @@  struct MemoryListener {
 };
 
 /**
+ * AddressSpaceOps: callbacks structure for address space specific operations
+ *
+ * @iommu_get: returns an IOMMU object that backs the address space.
+ *             Normally this should be NULL for generic address
+ *             spaces, and it's only used when there is one
+ *             translation unit behind this address space.
+ */
+struct AddressSpaceOps {
+    IOMMUObject *(*iommu_get)(AddressSpace *as);
+};
+typedef struct AddressSpaceOps AddressSpaceOps;
+
+/**
  * AddressSpace: describes a mapping of addresses to #MemoryRegion objects
  */
 struct AddressSpace {
@@ -302,6 +324,7 @@  struct AddressSpace {
     MemoryListener dispatch_listener;
     QTAILQ_HEAD(memory_listeners_as, MemoryListener) listeners;
     QTAILQ_ENTRY(AddressSpace) address_spaces_link;
+    AddressSpaceOps as_ops;
 };
 
 /**
@@ -1800,6 +1823,13 @@  address_space_write_cached(MemoryRegionCache *cache, hwaddr addr,
     address_space_write(cache->as, cache->xlat + addr, MEMTXATTRS_UNSPECIFIED, buf, len);
 }
 
+/**
+ * address_space_iommu_get: Get the backend IOMMU for the address space
+ *
+ * @as: the address space to fetch IOMMU from
+ */
+IOMMUObject *address_space_iommu_get(AddressSpace *as);
+
 #endif
 
 #endif
diff --git a/memory.c b/memory.c
index 6af523e..6aaad45 100644
--- a/memory.c
+++ b/memory.c
@@ -2500,6 +2500,14 @@  void address_space_destroy(AddressSpace *as)
     call_rcu(as, do_address_space_destroy, rcu);
 }
 
+IOMMUObject *address_space_iommu_get(AddressSpace *as)
+{
+    if (!as->as_ops.iommu_get) {
+        return NULL;
+    }
+    return as->as_ops.iommu_get(as);
+}
+
 static const char *memory_region_type(MemoryRegion *mr)
 {
     if (memory_region_is_ram_device(mr)) {
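
For completeness, a trivial caller-side illustration of the new API's
contract (not part of the patch): a generic address space with no
as_ops.iommu_get hook simply yields NULL.

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Sketch: decide whether a translation unit sits behind an AS */
static bool as_is_behind_iommu(AddressSpace *as)
{
    /* NULL for e.g. &address_space_memory; non-NULL only when an arch
     * has hooked up as_ops.iommu_get for this address space */
    return address_space_iommu_get(as) != NULL;
}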