diff mbox

[v3,3/3] IOMMU: Integrate between VFIO and vIOMMU to support device assignment

Message ID 1463847590-22782-4-git-send-email-bd.aviv@gmail.com
State New
Headers show

Commit Message

Aviv B.D. May 21, 2016, 4:19 p.m. UTC
From: "Aviv Ben-David" <bd.aviv@gmail.com>

Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
---
 hw/i386/intel_iommu.c          | 69 ++++++++++++++++++++++++++++++++++++++++--
 hw/i386/intel_iommu_internal.h |  2 ++
 hw/vfio/common.c               | 11 +++++--
 include/hw/i386/intel_iommu.h  |  4 +++
 include/hw/vfio/vfio-common.h  |  1 +
 5 files changed, 81 insertions(+), 6 deletions(-)

Comments

Alex Williamson May 23, 2016, 5:53 p.m. UTC | #1
On Sat, 21 May 2016 19:19:50 +0300
"Aviv B.D" <bd.aviv@gmail.com> wrote:

> From: "Aviv Ben-David" <bd.aviv@gmail.com>
> 

Some commentary about the changes necessary to achieve $SUBJECT would
be nice here.

> Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> ---
>  hw/i386/intel_iommu.c          | 69 ++++++++++++++++++++++++++++++++++++++++--
>  hw/i386/intel_iommu_internal.h |  2 ++
>  hw/vfio/common.c               | 11 +++++--
>  include/hw/i386/intel_iommu.h  |  4 +++
>  include/hw/vfio/vfio-common.h  |  1 +
>  5 files changed, 81 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 410f810..128ec7c 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
>  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
>  #endif
>  
> +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> +                                    uint8_t devfn, VTDContextEntry *ce);
> +
>  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
>                              uint64_t wmask, uint64_t w1cmask)
>  {
> @@ -126,6 +129,22 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
>      return new_val;
>  }
>  
> +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id)
> +{
> +    VTDContextEntry ce;
> +    int ret_fr;
> +
> +    assert(domain_id);
> +
> +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> +    if (ret_fr){
> +        return -1;
> +    }
> +
> +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> +    return 0;
> +}
> +
>  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
>                                          uint64_t clear, uint64_t mask)
>  {
> @@ -724,9 +743,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
>      }
>  
>      if (!vtd_context_entry_present(ce)) {
> -        VTD_DPRINTF(GENERAL,
> -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> -                    "is not present", devfn, bus_num);
>          return -VTD_FR_CONTEXT_ENTRY_P;
>      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
>                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> @@ -1033,18 +1049,58 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
>                                  &domain_id);
>  }
>  
> +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id,
> +                                      hwaddr addr, uint8_t am)
> +{
> +    VFIOGuestIOMMU * giommu;
> +

VT-d parsing VFIO private data structures, nope this is not a good idea.

> +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);

VT-d needs to keep track of its own address spaces and call the iommu
notifier, it should have no visibility whatsoever that there are vfio
devices attached.

> +        uint16_t vfio_domain_id; 
> +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id);
> +        int i=0;
> +        if (!ret && domain_id == vfio_domain_id){
> +            IOMMUTLBEntry entry; 
> +            
> +            /* do vfio unmap */
> +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> +            entry.target_as = NULL;
> +            entry.iova = addr & VTD_PAGE_MASK_4K;
> +            entry.translated_addr = 0;
> +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> +            entry.perm = IOMMU_NONE;
> +            memory_region_notify_iommu(giommu->iommu, entry);
> +       
> +            /* do vfio map */
> +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> +            /* call to vtd_iommu_translate */
> +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> +                IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); 
> +                if (entry.perm != IOMMU_NONE){
> +                    memory_region_notify_iommu(giommu->iommu, entry);
> +                }
> +            }
> +        }
> +    }
> +}
> +
>  static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
>                                        hwaddr addr, uint8_t am)
>  {
>      VTDIOTLBPageInvInfo info;
>  
>      assert(am <= VTD_MAMV);
> +    
>      info.domain_id = domain_id;
>      info.addr = addr;
>      info.mask = ~((1 << am) - 1);
> +    
>      g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
> +
> +    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
>  }
>  
> +
>  /* Flush IOTLB
>   * Returns the IOTLB Actual Invalidation Granularity.
>   * @val: the content of the IOTLB_REG
> @@ -1912,6 +1968,13 @@ static Property vtd_properties[] = {
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu)
> +{
> +    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
> +    IntelIOMMUState *s = vtd_as->iommu_state;
> +    
> +    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
> +}
>  
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>  {
> diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> index ae40f73..102e9a5 100644
> --- a/hw/i386/intel_iommu_internal.h
> +++ b/hw/i386/intel_iommu_internal.h
> @@ -339,6 +339,8 @@ typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
>  #define VTD_PAGE_SHIFT_1G           30
>  #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
>  
> +#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
> +
>  struct VTDRootEntry {
>      uint64_t val;
>      uint64_t rsvd;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 88154a1..54fc8bc 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -35,6 +35,9 @@
>  #endif
>  #include "trace.h"
>  
> +#include "hw/sysbus.h"
> +#include "hw/i386/intel_iommu.h"
> +
>  struct vfio_group_head vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>  struct vfio_as_head vfio_address_spaces =
> @@ -315,12 +318,12 @@ static void vfio_iommu_map_notify(Notifier *n, void *data)
>  out:
>      rcu_read_unlock();
>  }
> -
> +#if 0
>  static hwaddr vfio_container_granularity(VFIOContainer *container)
>  {
>      return (hwaddr)1 << ctz64(container->iova_pgsizes);
>  }
> -
> +#endif


Clearly this is unacceptable, the code has a purpose.

>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener,
>          giommu->n.notify = vfio_iommu_map_notify;
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>  
> +        vtd_register_giommu(giommu);

vfio will not assume VT-d, this is why we register the notifier below.

>          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> +#if 0
>          memory_region_iommu_replay(giommu->iommu, &giommu->n,
>                                     vfio_container_granularity(container),
>                                     false);
> -
> +#endif

Clearly this also has a purpose.

>          return;
>      }
>  
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index b024ffa..22f3f83 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -23,6 +23,7 @@
>  #define INTEL_IOMMU_H
>  #include "hw/qdev.h"
>  #include "sysemu/dma.h"
> +#include "hw/vfio/vfio-common.h"

No.  This header probably should not have been put under include, VT-d
has no business walking our guest IOMMU list.

>  
>  #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>  #define INTEL_IOMMU_DEVICE(obj) \
> @@ -123,6 +124,8 @@ struct IntelIOMMUState {
>      MemoryRegionIOMMUOps iommu_ops;
>      GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
>      VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
> +
> +    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>  };
>  
>  /* Find the VTD Address space associated with the given bus pointer,
> @@ -130,4 +133,5 @@ struct IntelIOMMUState {
>   */
>  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
>  
> +void vtd_register_giommu(VFIOGuestIOMMU * giommu);
>  #endif
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index eb0e1b0..bf56a1d 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -92,6 +92,7 @@ typedef struct VFIOGuestIOMMU {
>      MemoryRegion *iommu;
>      Notifier n;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
> +    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;

No.  Use the existing interfaces, create your own address space
tracking in VT-d, we are not going to host a list for VT-d to use.
Also note that there's no consideration of hot-unplug support in these
changes.  vfio already works with guest iommus on powerpc, so any
change to vfio needs to be justified and generalized to a common
guest iommu api.  Thanks,

Alex

>  } VFIOGuestIOMMU;
>  
>  typedef struct VFIODeviceOps VFIODeviceOps;
Alex Williamson May 26, 2016, 8:58 p.m. UTC | #2
On Mon, 23 May 2016 11:53:42 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Sat, 21 May 2016 19:19:50 +0300
> "Aviv B.D" <bd.aviv@gmail.com> wrote:
> 
> > From: "Aviv Ben-David" <bd.aviv@gmail.com>
> >   
> 
> Some commentary about the changes necessary to achieve $SUBJECT would
> be nice here.
> 
> > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> > ---
> >  hw/i386/intel_iommu.c          | 69 ++++++++++++++++++++++++++++++++++++++++--
> >  hw/i386/intel_iommu_internal.h |  2 ++
> >  hw/vfio/common.c               | 11 +++++--
> >  include/hw/i386/intel_iommu.h  |  4 +++
> >  include/hw/vfio/vfio-common.h  |  1 +
> >  5 files changed, 81 insertions(+), 6 deletions(-)
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 410f810..128ec7c 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
> >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> >  #endif
> >  
> > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > +                                    uint8_t devfn, VTDContextEntry *ce);
> > +
> >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
> >                              uint64_t wmask, uint64_t w1cmask)
> >  {
> > @@ -126,6 +129,22 @@ static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
> >      return new_val;
> >  }
> >  
> > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id)
> > +{
> > +    VTDContextEntry ce;
> > +    int ret_fr;
> > +
> > +    assert(domain_id);
> > +
> > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > +    if (ret_fr){
> > +        return -1;
> > +    }
> > +
> > +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> > +    return 0;
> > +}
> > +
> >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
> >                                          uint64_t clear, uint64_t mask)
> >  {
> > @@ -724,9 +743,6 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> >      }
> >  
> >      if (!vtd_context_entry_present(ce)) {
> > -        VTD_DPRINTF(GENERAL,
> > -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > -                    "is not present", devfn, bus_num);
> >          return -VTD_FR_CONTEXT_ENTRY_P;
> >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > @@ -1033,18 +1049,58 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> >                                  &domain_id);
> >  }
> >  
> > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id,
> > +                                      hwaddr addr, uint8_t am)
> > +{
> > +    VFIOGuestIOMMU * giommu;
> > +  
> 
> VT-d parsing VFIO private data structures, nope this is not a good idea.
> 
> > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);  
> 
> VT-d needs to keep track of its own address spaces and call the iommu
> notifier, it should have no visibility whatsoever that there are vfio
> devices attached.
> 
> > +        uint16_t vfio_domain_id; 
> > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id);
> > +        int i=0;
> > +        if (!ret && domain_id == vfio_domain_id){
> > +            IOMMUTLBEntry entry; 
> > +            
> > +            /* do vfio unmap */
> > +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
> > +            entry.target_as = NULL;
> > +            entry.iova = addr & VTD_PAGE_MASK_4K;
> > +            entry.translated_addr = 0;
> > +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> > +            entry.perm = IOMMU_NONE;
> > +            memory_region_notify_iommu(giommu->iommu, entry);
> > +       
> > +            /* do vfio map */
> > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
> > +            /* call to vtd_iommu_translate */
> > +            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
> > +                IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); 
> > +                if (entry.perm != IOMMU_NONE){
> > +                    memory_region_notify_iommu(giommu->iommu, entry);
> > +                }
> > +            }
> > +        }
> > +    }
> > +}
> > +


I think I see what you're trying to do here, find the VTDAddressSpaces
that vfio knows about and determine which are affected by an
invalidation, but I think it really just represents a flaw in the VT-d
code not really matching real hardware.  As I understand the real
hardware, per device translations use the bus:devfn to get the context
entry.  The context entry contains both the domain_id and a pointer to
the page table.  Note that VT-d requires that domain_id and page table
pointers are consistent for all entries, there cannot be two separate
context entries with the same domain_id that point to different page
tables. So really, VTDAddressSpace should be based at the domain, not
the context entry.  Multiple context entries can point to the same
domain, and thus the same VTDAddressSpace.  Doing so would make the
invalidation trivial, simply lookup the VTDAddressSpace based on the
domain_id, get the MemoryRegion, and fire off
memory_region_notify_iommu()s, which then get {un}mapped through vfio
without any direct interaction.  Unfortunately I think that means that
intel-iommu needs some data structure surgery.

With the current code, I think the best we could do would be to look
through every context for matching domain_ids.  For each one, use that
bus:devfn to get the VTDAddressSpace and thus the MemoryRegion and
call a notify for each.  That's not only an inefficient lookup, but
requires a notify per matching bus:devfn since each uses a separate
VTDAddressSpace when we should really only need a notify per domain_id. 

Also, I haven't figured it out yet, but I think we're also going to
need to actually populate the MemoryRegion rather than just use it to
send notifies, otherwise replay won't work, which means that a device
added to a domain with existing mappings wouldn't work.  Thanks,

Alex
Aviv B.D. May 28, 2016, 10:52 a.m. UTC | #3
Hi,
Your idea to search the relevent VTDAddressSpace and call it's notifier
will
probably work. Next week I'll try to implement it (for now with the costly
scan
of each context).
I still not sure if populating the MemoryRegion will suffice for hot plug
vfio
device but i'll try to look into it.

As far as I understand the memory_region_iommu_replay function, it still
scans
the whole 64bit address space, and therefore may hang the VM for a long
time.

Aviv.

On Thu, May 26, 2016 at 11:58 PM Alex Williamson <alex.williamson@redhat.com>
wrote:

> On Mon, 23 May 2016 11:53:42 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
>
> > On Sat, 21 May 2016 19:19:50 +0300
> > "Aviv B.D" <bd.aviv@gmail.com> wrote:
> >
> > > From: "Aviv Ben-David" <bd.aviv@gmail.com>
> > >
> >
> > Some commentary about the changes necessary to achieve $SUBJECT would
> > be nice here.
> >
> > > Signed-off-by: Aviv Ben-David <bd.aviv@gmail.com>
> > > ---
> > >  hw/i386/intel_iommu.c          | 69
> ++++++++++++++++++++++++++++++++++++++++--
> > >  hw/i386/intel_iommu_internal.h |  2 ++
> > >  hw/vfio/common.c               | 11 +++++--
> > >  include/hw/i386/intel_iommu.h  |  4 +++
> > >  include/hw/vfio/vfio-common.h  |  1 +
> > >  5 files changed, 81 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index 410f810..128ec7c 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -43,6 +43,9 @@ static int vtd_dbgflags = VTD_DBGBIT(GENERAL) |
> VTD_DBGBIT(CSR);
> > >  #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
> > >  #endif
> > >
> > > +static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t
> bus_num,
> > > +                                    uint8_t devfn, VTDContextEntry
> *ce);
> > > +
> > >  static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t
> val,
> > >                              uint64_t wmask, uint64_t w1cmask)
> > >  {
> > > @@ -126,6 +129,22 @@ static uint32_t
> vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
> > >      return new_val;
> > >  }
> > >
> > > +static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num,
> uint8_t devfn, uint16_t * domain_id)
> > > +{
> > > +    VTDContextEntry ce;
> > > +    int ret_fr;
> > > +
> > > +    assert(domain_id);
> > > +
> > > +    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
> > > +    if (ret_fr){
> > > +        return -1;
> > > +    }
> > > +
> > > +    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
> > > +    return 0;
> > > +}
> > > +
> > >  static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr
> addr,
> > >                                          uint64_t clear, uint64_t mask)
> > >  {
> > > @@ -724,9 +743,6 @@ static int
> vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
> > >      }
> > >
> > >      if (!vtd_context_entry_present(ce)) {
> > > -        VTD_DPRINTF(GENERAL,
> > > -                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
> > > -                    "is not present", devfn, bus_num);
> > >          return -VTD_FR_CONTEXT_ENTRY_P;
> > >      } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
> > >                 (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
> > > @@ -1033,18 +1049,58 @@ static void
> vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
> > >                                  &domain_id);
> > >  }
> > >
> > > +static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s,
> uint16_t domain_id,
> > > +                                      hwaddr addr, uint8_t am)
> > > +{
> > > +    VFIOGuestIOMMU * giommu;
> > > +
> >
> > VT-d parsing VFIO private data structures, nope this is not a good idea.
> >
> > > +    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
> > > +        VTDAddressSpace *vtd_as = container_of(giommu->iommu,
> VTDAddressSpace, iommu);
> >
> > VT-d needs to keep track of its own address spaces and call the iommu
> > notifier, it should have no visibility whatsoever that there are vfio
> > devices attached.
> >
> > > +        uint16_t vfio_domain_id;
> > > +        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus),
> vtd_as->devfn, &vfio_domain_id);
> > > +        int i=0;
> > > +        if (!ret && domain_id == vfio_domain_id){
> > > +            IOMMUTLBEntry entry;
> > > +
> > > +            /* do vfio unmap */
> > > +            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d",
> addr, am);
> > > +            entry.target_as = NULL;
> > > +            entry.iova = addr & VTD_PAGE_MASK_4K;
> > > +            entry.translated_addr = 0;
> > > +            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
> > > +            entry.perm = IOMMU_NONE;
> > > +            memory_region_notify_iommu(giommu->iommu, entry);
> > > +
> > > +            /* do vfio map */
> > > +            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d",
> addr, am);
> > > +            /* call to vtd_iommu_translate */
> > > +            for (i = 0; i < (1 << am); i++, addr+=(1 <<
> VTD_PAGE_SHIFT_4K)){
> > > +                IOMMUTLBEntry entry =
> s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL);
> > > +                if (entry.perm != IOMMU_NONE){
> > > +                    memory_region_notify_iommu(giommu->iommu, entry);
> > > +                }
> > > +            }
> > > +        }
> > > +    }
> > > +}
> > > +
>
>
> I think I see what you're trying to do here, find the VTDAddressSpaces
> that vfio knows about and determine which are affected by an
> invalidation, but I think it really just represents a flaw in the VT-d
> code not really matching real hardware.  As I understand the real
> hardware, per device translations use the bus:devfn to get the context
> entry.  The context entry contains both the domain_id and a pointer to
> the page table.  Note that VT-d requires that domain_id and page table
> pointers are consistent for all entries, there cannot be two separate
> context entries with the same domain_id that point to different page
> tables. So really, VTDAddressSpace should be based at the domain, not
> the context entry.  Multiple context entries can point to the same
> domain, and thus the same VTDAddressSpace.  Doing so would make the
> invalidation trivial, simply lookup the VTDAddressSpace based on the
> domain_id, get the MemoryRegion, and fire off
> memory_region_notify_iommu()s, which then get {un}mapped through vfio
> without any direct interaction.  Unfortunately I think that means that
> intel-iommu needs some data structure surgery.
>
> With the current code, I think the best we could do would be to look
> through every context for matching domain_ids.  For each one, use that
> bus:devfn to get the VTDAddressSpace and thus the MemoryRegion and
> call a notify for each.  That's not only an inefficient lookup, but
> requires a notify per matching bus:devfn since each uses a separate
> VTDAddressSpace when we should really only need a notify per domain_id.
>
> Also, I haven't figured it out yet, but I think we're also going to
> need to actually populate the MemoryRegion rather than just use it to
> send notifies, otherwise replay won't work, which means that a device
> added to a domain with existing mappings wouldn't work.  Thanks,
>
> Alex
>
Alex Williamson May 28, 2016, 4:02 p.m. UTC | #4
On Sat, 28 May 2016 10:52:58 +0000
"Aviv B.D." <bd.aviv@gmail.com> wrote:

> Hi,
> Your idea to search the relevent VTDAddressSpace and call it's notifier
> will
> probably work. Next week I'll try to implement it (for now with the costly
> scan
> of each context).

I think an optimization we can make is to use pci_for_each_bus() and
pci_for_each_device() to scan only context entries where devices are
present.  Then for each context entry, retrieve the DID, if it matches
the invalidation domain_id, retrieve the VTDAddressSpace and perform a
memory_region_notify_iommu() using VTDAddressSpace.iommu.  Still
horribly inefficient, but an improvement over walking all context
entries and avoids gratuitous callbacks between unrelated drivers in
QEMU.

Overall, I have very little faith that this will be the only change
required to make this work though.  For instance, if a device is added
or removed from a domain, where is that accounted for?  Ideally this
should trigger the region_add/region_del listener callbacks, but I
don't see how that works with how VT-d creates a fixed VTDAddressSpace
per device, and in fact how our QEMU memory model doesn't allow the
address space of a device to be dynamically aliased against other
address spaces or really changed at all. 

> I still not sure if populating the MemoryRegion will suffice for hot plug
> vfio
> device but i'll try to look into it.
> 
> As far as I understand the memory_region_iommu_replay function, it still
> scans
> the whole 64bit address space, and therefore may hang the VM for a long
> time.

Then we need to fix that problem, one option might be to make a replay
callback on MemoryRegionIOMMUOps that walks the page tables for a given
context entry rather than blindly traversing a 64bit address space.  We
can't simply ignore the issue by #ifdef'ing out the code.  I suspect
there's a lot more involved to make VT-d interact properly with a
physical device than what's been proposed so far.  At every
invalidation, we need to figure out what's changed and update the host
mappings.  We also need better, more dynamic address space management
to make the virtual hardware reflect physical hardware when we enable
things like passthrough mode or have multiple devices sharing an iommu
domain.  I think we're just barely scratching the surface here.  Thanks,

Alex
Aviv B.D. May 28, 2016, 4:10 p.m. UTC | #5
On Sat, May 28, 2016 at 7:02 PM Alex Williamson <alex.williamson@redhat.com>
wrote:

> On Sat, 28 May 2016 10:52:58 +0000
> "Aviv B.D." <bd.aviv@gmail.com> wrote:
>
> > Hi,
> > Your idea to search the relevent VTDAddressSpace and call it's notifier
> > will
> > probably work. Next week I'll try to implement it (for now with the
> costly
> > scan
> > of each context).
>
> I think an optimization we can make is to use pci_for_each_bus() and
> pci_for_each_device() to scan only context entries where devices are
> present.  Then for each context entry, retrieve the DID, if it matches
> the invalidation domain_id, retrieve the VTDAddressSpace and perform a
> memory_region_notify_iommu() using VTDAddressSpace.iommu.  Still
> horribly inefficient, but an improvement over walking all context
> entries and avoids gratuitous callbacks between unrelated drivers in
> QEMU.
>

Thanks for the references on how I can do it. :)

>
> Overall, I have very little faith that this will be the only change
> required to make this work though.  For instance, if a device is added
> or removed from a domain, where is that accounted for?  Ideally this
> should trigger the region_add/region_del listener callbacks, but I
> don't see how that works with how VT-d creates a fixed VTDAddressSpace
> per device, and in fact how our QEMU memory model doesn't allow the
> address space of a device to be dynamically aliased against other
> address spaces or really changed at all.
>
> > I still not sure if populating the MemoryRegion will suffice for hot plug
> > vfio
> > device but i'll try to look into it.
> >
> > As far as I understand the memory_region_iommu_replay function, it still
> > scans
> > the whole 64bit address space, and therefore may hang the VM for a long
> > time.
>
> Then we need to fix that problem, one option might be to make a replay
> callback on MemoryRegionIOMMUOps that walks the page tables for a given
> context entry rather than blindly traversing a 64bit address space.  We
> can't simply ignore the issue by #ifdef'ing out the code.  I suspect
> there's a lot more involved to make VT-d interact properly with a
> physical device than what's been proposed so far.  At every
> invalidation, we need to figure out what's changed and update the host
> mappings.  We also need better, more dynamic address space management
> to make the virtual hardware reflect physical hardware when we enable
> things like passthrough mode or have multiple devices sharing an iommu
> domain.  I think we're just barely scratching the surface here.  Thanks,
>
> Alex
>


I agree with you regarding hotplug, therefore I only ifdef this code out
and didn't
delete it. With the call to memory_region_iommu_replay QEMU hangs on startup
with a very long loop that prevent any device assignment  with vIOMMU
enabled.

I'm hoping not to enlarge the scope of this patch to include hotplug device
assignment
with iommu enabled.

Thanks,
Aviv.
Alex Williamson May 28, 2016, 5:39 p.m. UTC | #6
On Sat, 28 May 2016 16:10:55 +0000
"Aviv B.D." <bd.aviv@gmail.com> wrote:

> On Sat, May 28, 2016 at 7:02 PM Alex Williamson <alex.williamson@redhat.com>
> wrote:
> 
> > On Sat, 28 May 2016 10:52:58 +0000
> > "Aviv B.D." <bd.aviv@gmail.com> wrote:
> >  
> > > Hi,
> > > Your idea to search the relevent VTDAddressSpace and call it's notifier
> > > will
> > > probably work. Next week I'll try to implement it (for now with the  
> > costly  
> > > scan
> > > of each context).  
> >
> > I think an optimization we can make is to use pci_for_each_bus() and
> > pci_for_each_device() to scan only context entries where devices are
> > present.  Then for each context entry, retrieve the DID, if it matches
> > the invalidation domain_id, retrieve the VTDAddressSpace and perform a
> > memory_region_notify_iommu() using VTDAddressSpace.iommu.  Still
> > horribly inefficient, but an improvement over walking all context
> > entries and avoids gratuitous callbacks between unrelated drivers in
> > QEMU.
> >  
> 
> Thanks for the references on how I can do it. :)
> 
> >
> > Overall, I have very little faith that this will be the only change
> > required to make this work though.  For instance, if a device is added
> > or removed from a domain, where is that accounted for?  Ideally this
> > should trigger the region_add/region_del listener callbacks, but I
> > don't see how that works with how VT-d creates a fixed VTDAddressSpace
> > per device, and in fact how our QEMU memory model doesn't allow the
> > address space of a device to be dynamically aliased against other
> > address spaces or really changed at all.
> >  
> > > I still not sure if populating the MemoryRegion will suffice for hot plug
> > > vfio
> > > device but i'll try to look into it.
> > >
> > > As far as I understand the memory_region_iommu_replay function, it still
> > > scans
> > > the whole 64bit address space, and therefore may hang the VM for a long
> > > time.  
> >
> > Then we need to fix that problem, one option might be to make a replay
> > callback on MemoryRegionIOMMUOps that walks the page tables for a given
> > context entry rather than blindly traversing a 64bit address space.  We
> > can't simply ignore the issue by #ifdef'ing out the code.  I suspect
> > there's a lot more involved to make VT-d interact properly with a
> > physical device than what's been proposed so far.  At every
> > invalidation, we need to figure out what's changed and update the host
> > mappings.  We also need better, more dynamic address space management
> > to make the virtual hardware reflect physical hardware when we enable
> > things like passthrough mode or have multiple devices sharing an iommu
> > domain.  I think we're just barely scratching the surface here.  Thanks,
> >
> > Alex
> >  
> 
> 
> I agree with you regarding hotplug, therefore I only ifdef this code out
> and didn't
> delete it. With the call to memory_region_iommu_replay QEMU hangs on startup
> with a very long loop that prevent any device assignment  with vIOMMU
> enabled.
> 
> I'm hoping not to enlarge the scope of this patch to include hotplug device
> assignment
> with iommu enabled.

It's not just hotplug, any case where an existing domain can be applied
to a device.  The series is incomplete without such support and I won't
accept any changes into vfio that disables code that's correct in other
contexts.  Thanks,

Alex
Aviv B.D. May 28, 2016, 6:14 p.m. UTC | #7
Hi,
As far as I tested the disabled code (call to memory_region_iommu_replay)
hangup
QEMU on startup if IOMMU is enabled (scaning 64 bit address space takes
more
than an hour on modern hardware) , at least on x86 hardware. So the code is
not 100%
correct for any context. Maybe it just should be disabled for x86
architecture?

By specification any such behavior of applying a domain to device should
include
cache invalidation if CM flag is present so I'm not thinking that my patch
break
this scenario.

Thanks,
Aviv.

On Sat, May 28, 2016 at 8:39 PM Alex Williamson <alex.williamson@redhat.com>
wrote:

> On Sat, 28 May 2016 16:10:55 +0000
> "Aviv B.D." <bd.aviv@gmail.com> wrote:
>
> > On Sat, May 28, 2016 at 7:02 PM Alex Williamson <
> alex.williamson@redhat.com>
> > wrote:
> >
> > > On Sat, 28 May 2016 10:52:58 +0000
> > > "Aviv B.D." <bd.aviv@gmail.com> wrote:
> > >
> > > > Hi,
> > > > Your idea to search the relevent VTDAddressSpace and call it's
> notifier
> > > > will
> > > > probably work. Next week I'll try to implement it (for now with the
> > > costly
> > > > scan
> > > > of each context).
> > >
> > > I think an optimization we can make is to use pci_for_each_bus() and
> > > pci_for_each_device() to scan only context entries where devices are
> > > present.  Then for each context entry, retrieve the DID, if it matches
> > > the invalidation domain_id, retrieve the VTDAddressSpace and perform a
> > > memory_region_notify_iommu() using VTDAddressSpace.iommu.  Still
> > > horribly inefficient, but an improvement over walking all context
> > > entries and avoids gratuitous callbacks between unrelated drivers in
> > > QEMU.
> > >
> >
> > Thanks for the references on how I can do it. :)
> >
> > >
> > > Overall, I have very little faith that this will be the only change
> > > required to make this work though.  For instance, if a device is added
> > > or removed from a domain, where is that accounted for?  Ideally this
> > > should trigger the region_add/region_del listener callbacks, but I
> > > don't see how that works with how VT-d creates a fixed VTDAddressSpace
> > > per device, and in fact how our QEMU memory model doesn't allow the
> > > address space of a device to be dynamically aliased against other
> > > address spaces or really changed at all.
> > >
> > > > I still not sure if populating the MemoryRegion will suffice for hot
> plug
> > > > vfio
> > > > device but i'll try to look into it.
> > > >
> > > > As far as I understand the memory_region_iommu_replay function, it
> still
> > > > scans
> > > > the whole 64bit address space, and therefore may hang the VM for a
> long
> > > > time.
> > >
> > > Then we need to fix that problem, one option might be to make a replay
> > > callback on MemoryRegionIOMMUOps that walks the page tables for a given
> > > context entry rather than blindly traversing a 64bit address space.  We
> > > can't simply ignore the issue by #ifdef'ing out the code.  I suspect
> > > there's a lot more involved to make VT-d interact properly with a
> > > physical device than what's been proposed so far.  At every
> > > invalidation, we need to figure out what's changed and update the host
> > > mappings.  We also need better, more dynamic address space management
> > > to make the virtual hardware reflect physical hardware when we enable
> > > things like passthrough mode or have multiple devices sharing an iommu
> > > domain.  I think we're just barely scratching the surface here.
> Thanks,
> > >
> > > Alex
> > >
> >
> >
> > I agree with you regarding hotplug, therefore I only ifdef this code out
> > and didn't
> > delete it. With the call to memory_region_iommu_replay QEMU hangs on
> startup
> > with a very long loop that prevent any device assignment  with vIOMMU
> > enabled.
> >
> > I'm hoping not to enlarge the scope of this patch to include hotplug
> device
> > assignment
> > with iommu enabled.
>
> It's not just hotplug, any case where an existing domain can be applied
> to a device.  The series is incomplete without such support and I won't
> accept any changes into vfio that disables code that's correct in other
> contexts.  Thanks,
>
> Alex
>
Alex Williamson May 28, 2016, 7:48 p.m. UTC | #8
On Sat, 28 May 2016 18:14:18 +0000
"Aviv B.D." <bd.aviv@gmail.com> wrote:

> Hi,
> As far as I tested the disabled code (call to memory_region_iommu_replay)
> hangup
> QEMU on startup if IOMMU is enabled (scaning 64 bit address space takes
> more
> than an hour on modern hardware) , at least on x86 hardware. So the code is
> not 100%
> correct for any context. Maybe it just should be disabled for x86
> architecture?
> 
> By specification any such behavior of applying a domain to device should
> include
> cache invalidation if CM flag is present so I'm not thinking that my patch
> break
> this scenario.

The functionality is completely necessary, imagine moving a device from
an IOMMU API domain in the guest back to the passthrough domain, if
there is no replay of the IOMMU context, the device cannot perform any
DMA at all.  The current replay mechanism is obviously not designed for
iterating over every page of a 64bit address space, which is why I
suggest a replay callback on MemoryRegionIOMMUOps so that VT-d can
optimize the replay by walking the VT-d page tables and perhaps
implementation of hardware passthrough mode and the ability to
dynamically switch a device to address_space_memory.  The current
replay code is correct and functional in a context with a window based
IOMMU where the IOMMU address space is much smaller.  We cannot have
correct operation without a mechanism to rebuild the host IOMMU context
when a device is switched to a new domain.  Please address it.  Thanks,

Alex
Aviv B.D. June 2, 2016, 1:09 p.m. UTC | #9
Hi,

In case of hot plug vfio device there should not be any active mapping
to this device prior the device addition. Also before it added to a guest
the guest should not attach the device to any domain as the device is not
present.
With CM enabled the guest must invalidate the domain or individual mappings
that belong to this new device before any use of those maps.

I'm still not sure that this functionality is necessary in x86 and
currently there
is a scenario (for x86) that use this functionality.

Thanks,
Aviv.

On Sat, May 28, 2016 at 10:48 PM Alex Williamson <alex.williamson@redhat.com>
wrote:

> On Sat, 28 May 2016 18:14:18 +0000
> "Aviv B.D." <bd.aviv@gmail.com> wrote:
>
> > Hi,
> > As far as I tested the disabled code (call to memory_region_iommu_replay)
> > hangup
> > QEMU on startup if IOMMU is enabled (scaning 64 bit address space takes
> > more
> > than an hour on modern hardware) , at least on x86 hardware. So the code
> is
> > not 100%
> > correct for any context. Maybe it just should be disabled for x86
> > architecture?
> >
> > By specification any such behavior of applying a domain to device should
> > include
> > cache invalidation if CM flag is present so I'm not thinking that my
> patch
> > break
> > this scenario.
>
> The functionality is completely necessary, imagine moving a device from
> an IOMMU API domain in the guest back to the passthrough domain, if
> there is no replay of the IOMMU context, the device cannot perform any
> DMA at all.  The current replay mechanism is obviously not designed for
> iterating over every page of a 64bit address space, which is why I
> suggest a replay callback on MemoryRegionIOMMUOps so that VT-d can
> optimize the replay by walking the VT-d page tables and perhaps
> implementation of hardware passthrough mode and the ability to
> dynamically switch a device to address_space_memory.  The current
> replay code is correct and functional in a context with a window based
> IOMMU where the IOMMU address space is much smaller.  We cannot have
> correct operation without a mechanism to rebuild the host IOMMU context
> when a device is switched to a new domain.  Please address it.  Thanks,
>
> Alex
>
Alex Williamson June 2, 2016, 1:34 p.m. UTC | #10
On Thu, 02 Jun 2016 13:09:27 +0000
"Aviv B.D." <bd.aviv@gmail.com> wrote:

> Hi,
> 
> In case of hot plug vfio device there should not be any active mapping
> to this device prior the device addition.

Counter example - a device is hot added to a guest booted with iommu=pt.

> Also before it added to a guest
> the guest should not attach the device to any domain as the device is not
> present.

The static identity map domain (ie. passthrough domain) can precede the
device existing.

> With CM enabled the guest must invalidate the domain or individual mappings
> that belong to this new device before any use of those maps.
> 
> I'm still not sure that this functionality is necessary in x86 and
> currently there
> is a scenario (for x86) that use this functionality.

Clearly I disagree, it is necessary.  Thanks,

Alex
Peter Xu June 6, 2016, 7:38 a.m. UTC | #11
Some questions not quite related to this patch content but vfio...

On Mon, May 23, 2016 at 11:53:42AM -0600, Alex Williamson wrote:
> On Sat, 21 May 2016 19:19:50 +0300
> "Aviv B.D" <bd.aviv@gmail.com> wrote:

[...]

> > +#if 0
> >  static hwaddr vfio_container_granularity(VFIOContainer *container)
> >  {
> >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
> >  }
> > -
> > +#endif

Here we are fetching the smallest page size that host IOMMU support,
so even if host IOMMU support large pages, it will not be used as long
as guest enabled vIOMMU, right?

> 
> 
> Clearly this is unacceptable, the code has a purpose.
> 
> >  static void vfio_listener_region_add(MemoryListener *listener,
> >                                       MemoryRegionSection *section)
> >  {
> > @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener,
> >          giommu->n.notify = vfio_iommu_map_notify;
> >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> >  
> > +        vtd_register_giommu(giommu);
> 
> vfio will not assume VT-d, this is why we register the notifier below.
> 
> >          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> > +#if 0
> >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
> >                                     vfio_container_granularity(container),
> >                                     false);

For memory_region_iommu_replay(), we are using
vfio_container_granularity() as the granularity, which is the host
IOMMU page size. However inside it:

void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n,
                                hwaddr granularity, bool is_write)
{
    hwaddr addr;
    IOMMUTLBEntry iotlb;

    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
        iotlb = mr->iommu_ops->translate(mr, addr, is_write);
        if (iotlb.perm != IOMMU_NONE) {
            n->notify(n, &iotlb);
        }

        /* if (2^64 - MR size) < granularity, it's possible to get an
         * infinite loop here.  This should catch such a wraparound */
        if ((addr + granularity) < addr) {
            break;
        }
    }
}

Is it possible that iotlb mapped to a large page (or any page that is
not the same as granularity)? The above code should have assumed that
host/guest IOMMU are having the same page size == granularity?

Thanks,

-- peterx
Peter Xu June 6, 2016, 8:09 a.m. UTC | #12
On Thu, Jun 02, 2016 at 07:34:17AM -0600, Alex Williamson wrote:
> On Thu, 02 Jun 2016 13:09:27 +0000
> "Aviv B.D." <bd.aviv@gmail.com> wrote:
> 
> > Hi,
> > 
> > In case of hot plug vfio device there should not be any active mapping
> > to this device prior the device addition.
> 
> Counter example - a device is hot added to a guest booted with iommu=pt.

I got the same question with Aviv...

For hot-plug devices, even if it is using iommu=pt, shouldn't it still
follow the steps that first init vfio device, then configure device
context entry? Let me list the steps for device addition in case I got
any mistake:

1. user add new VFIO device A

2. vfio_listener_region_add() called for device A on the IOMMU mr,
   here we should create the iommu notifier. However since the context
   entry still does not exist, memory_region_iommu_replay() will got
   all invalid IOTLB (IOMMU_NONE entries)

3. guest kernel found the device, enabled the device, filled in
   context entry for device A with "pass-through" (so the SLPTPTR is
   invalid)

4. guest sent context invalidation to QEMU vIOMMU since we have CM=1
   set for guest vIOMMU

5. QEMU vIOMMU handle the invalidation, trigger VFIO notify to do
   correct VFIO mapping for device A

Though here step 5 should still be missing (IIUC Aviv's patch 3 still
not handled context invalidation). Just want to know whether we can
avoid the replay operation for Intel vIOMMUs (for Intel only, because
Intel has context invalidation and cache mode support, not sure about
other platform)?

Thanks,

-- peterx
Alex Williamson June 6, 2016, 5:30 p.m. UTC | #13
On Mon, 6 Jun 2016 15:38:25 +0800
Peter Xu <peterx@redhat.com> wrote:

> Some questions not quite related to this patch content but vfio...
> 
> On Mon, May 23, 2016 at 11:53:42AM -0600, Alex Williamson wrote:
> > On Sat, 21 May 2016 19:19:50 +0300
> > "Aviv B.D" <bd.aviv@gmail.com> wrote:  
> 
> [...]
> 
> > > +#if 0
> > >  static hwaddr vfio_container_granularity(VFIOContainer *container)
> > >  {
> > >      return (hwaddr)1 << ctz64(container->iova_pgsizes);
> > >  }
> > > -
> > > +#endif  
> 
> Here we are fetching the smallest page size that host IOMMU support,
> so even if host IOMMU support large pages, it will not be used as long
> as guest enabled vIOMMU, right?

Not using this replay mechanism, correct.  AFAIK, this replay code has
only been tested on POWER where the window is much, much smaller than
the 64bit address space and hugepages are not supported.  A replay
callback into the iommu could could not only walk the address space
more efficiently, but also attempt to map with hugepages.  It would
however need to be cautious not to coalesce separate mappings by the
guest into a single mapping through vfio, or else we're going to have
inconsistency for mapping vs unmapping that vfio does not expect or
support.
 
> > 
> > 
> > Clearly this is unacceptable, the code has a purpose.
> >   
> > >  static void vfio_listener_region_add(MemoryListener *listener,
> > >                                       MemoryRegionSection *section)
> > >  {
> > > @@ -384,11 +387,13 @@ static void vfio_listener_region_add(MemoryListener *listener,
> > >          giommu->n.notify = vfio_iommu_map_notify;
> > >          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
> > >  
> > > +        vtd_register_giommu(giommu);  
> > 
> > vfio will not assume VT-d, this is why we register the notifier below.
> >   
> > >          memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
> > > +#if 0
> > >          memory_region_iommu_replay(giommu->iommu, &giommu->n,
> > >                                     vfio_container_granularity(container),
> > >                                     false);  
> 
> For memory_region_iommu_replay(), we are using
> vfio_container_granularity() as the granularity, which is the host
> IOMMU page size. However inside it:
> 
> void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n,
>                                 hwaddr granularity, bool is_write)
> {
>     hwaddr addr;
>     IOMMUTLBEntry iotlb;
> 
>     for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
>         iotlb = mr->iommu_ops->translate(mr, addr, is_write);
>         if (iotlb.perm != IOMMU_NONE) {
>             n->notify(n, &iotlb);
>         }
> 
>         /* if (2^64 - MR size) < granularity, it's possible to get an
>          * infinite loop here.  This should catch such a wraparound */
>         if ((addr + granularity) < addr) {
>             break;
>         }
>     }
> }
> 
> Is it possible that iotlb mapped to a large page (or any page that is
> not the same as granularity)? The above code should have assumed that
> host/guest IOMMU are having the same page size == granularity?

I think this is answered above.  This is not remotely efficient code
for a real 64bit IOMMU (BTW, VT-d does not support the full 64bit
address space either, I believe it's more like 48bits) and is not going
to replay hugepages, but it will give us sufficiently correct IOMMU
entries... eventually. Thanks,

Alex
Alex Williamson June 6, 2016, 6:21 p.m. UTC | #14
On Mon, 6 Jun 2016 16:09:11 +0800
Peter Xu <peterx@redhat.com> wrote:

> On Thu, Jun 02, 2016 at 07:34:17AM -0600, Alex Williamson wrote:
> > On Thu, 02 Jun 2016 13:09:27 +0000
> > "Aviv B.D." <bd.aviv@gmail.com> wrote:
> >   
> > > Hi,
> > > 
> > > In case of hot plug vfio device there should not be any active mapping
> > > to this device prior the device addition.  
> > 
> > Counter example - a device is hot added to a guest booted with iommu=pt.  
> 
> I got the same question with Aviv...
> 
> For hot-plug devices

First, let's just remove "hot-plug" from this discussion because I'm
afraid someone is going to say "let's just not support hotplug
devices".  The issue of moving a device to a pre-populated domain can
occur any time a device attaches to a new domain.  It might occur if a
device is added to a vfio driver in the L1 guest where it's already
managing a device.  It might occur any time the device is released from
an L1 vfio user and returned to the static identity map in the L1 guest
kernel.

>, even if it is using iommu=pt, shouldn't it still
> follow the steps that first init vfio device, then configure device
> context entry? Let me list the steps for device addition in case I got
> any mistake:
> 
> 1. user add new VFIO device A
> 
> 2. vfio_listener_region_add() called for device A on the IOMMU mr,
>    here we should create the iommu notifier. However since the context
>    entry still does not exist, memory_region_iommu_replay() will got
>    all invalid IOTLB (IOMMU_NONE entries)
> 
> 3. guest kernel found the device, enabled the device, filled in
>    context entry for device A with "pass-through" (so the SLPTPTR is
>    invalid)
> 
> 4. guest sent context invalidation to QEMU vIOMMU since we have CM=1
>    set for guest vIOMMU
> 
> 5. QEMU vIOMMU handle the invalidation, trigger VFIO notify to do
>    correct VFIO mapping for device A
> 
> Though here step 5 should still be missing (IIUC Aviv's patch 3 still
> not handled context invalidation). Just want to know whether we can
> avoid the replay operation for Intel vIOMMUs (for Intel only, because
> Intel has context invalidation and cache mode support, not sure about
> other platform)?

I'm not sure why you're so eager to avoid implementing a replay
callback for VT-d.  What happens when VT-d is not enabled by the
guest?  vfio/pci.c calls pci_device_iommu_address_space() to get the
address space for the device, which results in vtd_find_add_as() which
gives us a unique VTDAddressSpace.as for that device.  With VT-d not in
use by the guest, when do steps 3-5 occur?  I agree with your reasoning
when VT-d is enabled, but the BIOS/UEFI does not enable VT-d, so are we
going to exclude all use cases of an assigned device prior to the guest
enabling VT-d?

On that same topic, I'm really dubious that we have the flexibility in
our memory API to really support an IOMMU like VT-d and the layout of
having a VTDAddressSpace per device, rather than exposing shared
address spaces, has implications on the efficiency and locked memory
requirements for vfio.  In the above case with VT-d disabled, the
VTDAddressSpace should alias to the system memory AddressSpace and
dynamically switch to a per device address space when VT-d is enabled.
AFAICT, we don't have anything resembling that sort of feature, so we
rely on the IOMMU driver to replay, perhaps even configuring its own
MemoryListener to IOMMU notifier gateway, which is also something that
doesn't currently exist.

Additionally, if there are things unique to VT-d, for instance if VT-d
is enabled and we can rely on the sequence of events you've set forth,
then yes, the replay mechanism should do nothing.  But only the VT-d
code can decide that, which implies that vfio always needs to call the
replay function and a new MemoryRegionIOMMUOps callback needs to decide
what to do given the current state of the vIOMMU.  Thanks,

Alex
Peter Xu June 7, 2016, 1:20 p.m. UTC | #15
On Mon, Jun 06, 2016 at 12:21:34PM -0600, Alex Williamson wrote:
[...]
> I'm not sure why you're so eager to avoid implementing a replay
> callback for VT-d.  What happens when VT-d is not enabled by the
> guest?  vfio/pci.c calls pci_device_iommu_address_space() to get the
> address space for the device, which results in vtd_find_add_as() which
> gives us a unique VTDAddressSpace.as for that device.  With VT-d not in
> use by the guest, when do steps 3-5 occur?  I agree with your reasoning
> when VT-d is enabled, but the BIOS/UEFI does not enable VT-d, so are we
> going to exclude all use cases of an assigned device prior to the guest
> enabling VT-d?

I think I got the point. I failed to consider the case that we can run
IOMMU without enabling it, like BIOS (as you have mentioned), or we
can disable IOMMU in kernel boot parameters (though, iommu=pt should
still follow the case that IOMMU is enabled).

Sounds like a replay callback is a good idea. For Intel version of the
callback: when DMAR is enabled, we can just return directly. when DMAR
is disabled, we just do whatever we need to do region_add() for the
global address_space_memory.

> 
> On that same topic, I'm really dubious that we have the flexibility in
> our memory API to really support an IOMMU like VT-d and the layout of
> having a VTDAddressSpace per device, rather than exposing shared
> address spaces, has implications on the efficiency and locked memory
> requirements for vfio.  In the above case with VT-d disabled, the
> VTDAddressSpace should alias to the system memory AddressSpace and
> dynamically switch to a per device address space when VT-d is enabled.
> AFAICT, we don't have anything resembling that sort of feature, so we
> rely on the IOMMU driver to replay, perhaps even configuring its own
> MemoryListener to IOMMU notifier gateway, which is also something that
> doesn't currently exist.

It sounds more like a notifier for "IOMMU enablement"? The notifier
should be triggered when IOMMU switch between enabled <-> disabled?
When this happens, we rebuild the mapping in some way.

> 
> Additionally, if there are things unique to VT-d, for instance if VT-d
> is enabled and we can rely on the sequence of events you've set forth,
> then yes, the replay mechanism should do nothing.  But only the VT-d
> code can decide that, which implies that vfio always needs to call the
> replay function and a new MemoryRegionIOMMUOps callback needs to decide
> what to do given the current state of the vIOMMU.  Thanks,

Right. Thanks.

-- peterx
diff mbox

Patch

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 410f810..128ec7c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +43,9 @@  static int vtd_dbgflags = VTD_DBGBIT(GENERAL) | VTD_DBGBIT(CSR);
 #define VTD_DPRINTF(what, fmt, ...) do {} while (0)
 #endif
 
+static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
+                                    uint8_t devfn, VTDContextEntry *ce);
+
 static void vtd_define_quad(IntelIOMMUState *s, hwaddr addr, uint64_t val,
                             uint64_t wmask, uint64_t w1cmask)
 {
@@ -126,6 +129,22 @@  static uint32_t vtd_set_clear_mask_long(IntelIOMMUState *s, hwaddr addr,
     return new_val;
 }
 
+static int vtd_get_did_dev(IntelIOMMUState *s, uint8_t bus_num, uint8_t devfn, uint16_t * domain_id)
+{
+    VTDContextEntry ce;
+    int ret_fr;
+
+    assert(domain_id);
+
+    ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce);
+    if (ret_fr){
+        return -1;
+    }
+
+    *domain_id =  VTD_CONTEXT_ENTRY_DID(ce.hi);
+    return 0;
+}
+
 static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
                                         uint64_t clear, uint64_t mask)
 {
@@ -724,9 +743,6 @@  static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     }
 
     if (!vtd_context_entry_present(ce)) {
-        VTD_DPRINTF(GENERAL,
-                    "error: context-entry #%"PRIu8 "(bus #%"PRIu8 ") "
-                    "is not present", devfn, bus_num);
         return -VTD_FR_CONTEXT_ENTRY_P;
     } else if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
                (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
@@ -1033,18 +1049,58 @@  static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
                                 &domain_id);
 }
 
+static void vtd_iotlb_page_invalidate_vfio(IntelIOMMUState *s, uint16_t domain_id,
+                                      hwaddr addr, uint8_t am)
+{
+    VFIOGuestIOMMU * giommu;
+
+    QLIST_FOREACH(giommu, &(s->giommu_list), iommu_next){
+        VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
+        uint16_t vfio_domain_id; 
+        int ret = vtd_get_did_dev(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &vfio_domain_id);
+        int i=0;
+        if (!ret && domain_id == vfio_domain_id){
+            IOMMUTLBEntry entry; 
+            
+            /* do vfio unmap */
+            VTD_DPRINTF(GENERAL, "Remove addr 0x%"PRIx64 " mask %d", addr, am);
+            entry.target_as = NULL;
+            entry.iova = addr & VTD_PAGE_MASK_4K;
+            entry.translated_addr = 0;
+            entry.addr_mask = ~VTD_PAGE_MASK(VTD_PAGE_SHIFT_4K + am);
+            entry.perm = IOMMU_NONE;
+            memory_region_notify_iommu(giommu->iommu, entry);
+       
+            /* do vfio map */
+            VTD_DPRINTF(GENERAL, "add addr 0x%"PRIx64 " mask %d", addr, am);
+            /* call to vtd_iommu_translate */
+            for (i = 0; i < (1 << am); i++, addr+=(1 << VTD_PAGE_SHIFT_4K)){
+                IOMMUTLBEntry entry = s->iommu_ops.translate(giommu->iommu, addr, IOMMU_NO_FAIL); 
+                if (entry.perm != IOMMU_NONE){
+                    memory_region_notify_iommu(giommu->iommu, entry);
+                }
+            }
+        }
+    }
+}
+
 static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
                                       hwaddr addr, uint8_t am)
 {
     VTDIOTLBPageInvInfo info;
 
     assert(am <= VTD_MAMV);
+    
     info.domain_id = domain_id;
     info.addr = addr;
     info.mask = ~((1 << am) - 1);
+    
     g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+
+    vtd_iotlb_page_invalidate_vfio(s, domain_id, addr, am);
 }
 
+
 /* Flush IOTLB
  * Returns the IOTLB Actual Invalidation Granularity.
  * @val: the content of the IOTLB_REG
@@ -1912,6 +1968,13 @@  static Property vtd_properties[] = {
     DEFINE_PROP_END_OF_LIST(),
 };
 
+void vtd_register_giommu(VFIOGuestIOMMU * giommu)
+{
+    VTDAddressSpace *vtd_as = container_of(giommu->iommu, VTDAddressSpace, iommu);
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    
+    QLIST_INSERT_HEAD(&s->giommu_list, giommu, iommu_next);
+}
 
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
 {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index ae40f73..102e9a5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -339,6 +339,8 @@  typedef struct VTDIOTLBPageInvInfo VTDIOTLBPageInvInfo;
 #define VTD_PAGE_SHIFT_1G           30
 #define VTD_PAGE_MASK_1G            (~((1ULL << VTD_PAGE_SHIFT_1G) - 1))
 
+#define VTD_PAGE_MASK(shift)        (~((1ULL << (shift)) - 1))
+
 struct VTDRootEntry {
     uint64_t val;
     uint64_t rsvd;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 88154a1..54fc8bc 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -35,6 +35,9 @@ 
 #endif
 #include "trace.h"
 
+#include "hw/sysbus.h"
+#include "hw/i386/intel_iommu.h"
+
 struct vfio_group_head vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
 struct vfio_as_head vfio_address_spaces =
@@ -315,12 +318,12 @@  static void vfio_iommu_map_notify(Notifier *n, void *data)
 out:
     rcu_read_unlock();
 }
-
+#if 0
 static hwaddr vfio_container_granularity(VFIOContainer *container)
 {
     return (hwaddr)1 << ctz64(container->iova_pgsizes);
 }
-
+#endif
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -384,11 +387,13 @@  static void vfio_listener_region_add(MemoryListener *listener,
         giommu->n.notify = vfio_iommu_map_notify;
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
 
+        vtd_register_giommu(giommu);
         memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
+#if 0
         memory_region_iommu_replay(giommu->iommu, &giommu->n,
                                    vfio_container_granularity(container),
                                    false);
-
+#endif
         return;
     }
 
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index b024ffa..22f3f83 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -23,6 +23,7 @@ 
 #define INTEL_IOMMU_H
 #include "hw/qdev.h"
 #include "sysemu/dma.h"
+#include "hw/vfio/vfio-common.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -123,6 +124,8 @@  struct IntelIOMMUState {
     MemoryRegionIOMMUOps iommu_ops;
     GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
     VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
+
+    QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
 };
 
 /* Find the VTD Address space associated with the given bus pointer,
@@ -130,4 +133,5 @@  struct IntelIOMMUState {
  */
 VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn);
 
+void vtd_register_giommu(VFIOGuestIOMMU * giommu);
 #endif
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index eb0e1b0..bf56a1d 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -92,6 +92,7 @@  typedef struct VFIOGuestIOMMU {
     MemoryRegion *iommu;
     Notifier n;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
+    QLIST_ENTRY(VFIOGuestIOMMU) iommu_next;
 } VFIOGuestIOMMU;
 
 typedef struct VFIODeviceOps VFIODeviceOps;