
[08/10] intel-iommu: maintain per-device iova ranges

Message ID 20180425045129.17449-9-peterx@redhat.com
State New
Series [01/10] intel-iommu: send PSI always even if across PDEs

Commit Message

Peter Xu April 25, 2018, 4:51 a.m. UTC
For each VTDAddressSpace, now we maintain what IOVA ranges we have
mapped and what we have not.  With that information, now we only send
MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we know
we have already mapped the range, meanwhile we don't send UNMAP notifies
if we know we never mapped the range at all.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/hw/i386/intel_iommu.h |  2 ++
 hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
 hw/i386/trace-events          |  2 ++
 3 files changed, 32 insertions(+)
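
A minimal standalone sketch of the decision logic this patch adds to
vtd_page_walk_one(); the array-based tracker and all tracker_* names
below are invented for illustration and only stand in for the series'
ITTree:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_RANGES 64

typedef struct { uint64_t start, end; bool used; } Range;
typedef struct { Range ranges[MAX_RANGES]; } RangeTracker;

/* Return a tracked range overlapping [start, end], or NULL. */
static Range *tracker_find(RangeTracker *t, uint64_t start, uint64_t end)
{
    for (int i = 0; i < MAX_RANGES; i++) {
        Range *r = &t->ranges[i];
        if (r->used && r->start <= end && start <= r->end) {
            return r;
        }
    }
    return NULL;
}

static void tracker_insert(RangeTracker *t, uint64_t start, uint64_t end)
{
    for (int i = 0; i < MAX_RANGES; i++) {
        if (!t->ranges[i].used) {
            t->ranges[i] = (Range){ start, end, true };
            return;
        }
    }
}

/* Whole-range removal only; good enough for this sketch. */
static void tracker_remove(RangeTracker *t, uint64_t start, uint64_t end)
{
    Range *r = tracker_find(t, start, end);
    if (r) {
        r->used = false;
    }
}

/* Mirror of the patch's decision table: forward a MAP only for ranges
 * not yet tracked, forward an UNMAP only for ranges currently tracked. */
static bool need_notify(RangeTracker *t, uint64_t iova, uint64_t mask, bool map)
{
    Range *mapped = tracker_find(t, iova, iova + mask);

    if (map) {
        if (mapped) {
            return false;               /* already mapped: skip MAP notify */
        }
        tracker_insert(t, iova, iova + mask);
    } else {
        if (!mapped) {
            return false;               /* never mapped: skip UNMAP notify */
        }
        tracker_remove(t, iova, iova + mask);
    }
    return true;
}

int main(void)
{
    RangeTracker t = {0};

    printf("map   0x1000: %d\n", need_notify(&t, 0x1000, 0xfff, true));  /* 1 */
    printf("map   0x1000: %d\n", need_notify(&t, 0x1000, 0xfff, true));  /* 0 */
    printf("unmap 0x2000: %d\n", need_notify(&t, 0x2000, 0xfff, false)); /* 0 */
    printf("unmap 0x1000: %d\n", need_notify(&t, 0x1000, 0xfff, false)); /* 1 */
    return 0;
}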

Comments

Jason Wang April 27, 2018, 6:07 a.m. UTC | #1
On 2018年04月25日 12:51, Peter Xu wrote:
> For each VTDAddressSpace, now we maintain what IOVA ranges we have
> mapped and what we have not.  With that information, now we only send
> MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we know
> we have already mapped the range, meanwhile we don't send UNMAP notifies
> if we know we never mapped the range at all.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>   include/hw/i386/intel_iommu.h |  2 ++
>   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
>   hw/i386/trace-events          |  2 ++
>   3 files changed, 32 insertions(+)
>
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index 486e205e79..09a2e94404 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -27,6 +27,7 @@
>   #include "hw/i386/ioapic.h"
>   #include "hw/pci/msi.h"
>   #include "hw/sysbus.h"
> +#include "qemu/interval-tree.h"
>   
>   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>   #define INTEL_IOMMU_DEVICE(obj) \
> @@ -95,6 +96,7 @@ struct VTDAddressSpace {
>       QLIST_ENTRY(VTDAddressSpace) next;
>       /* Superset of notifier flags that this address space has */
>       IOMMUNotifierFlag notifier_flags;
> +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
>   };
>   
>   struct VTDBus {
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index a19c18b8d4..8f396a5d13 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -768,12 +768,37 @@ typedef struct {
>   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
>                                vtd_page_walk_info *info)
>   {
> +    VTDAddressSpace *as = info->as;
>       vtd_page_walk_hook hook_fn = info->hook_fn;
>       void *private = info->private;
> +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> +                                   entry->iova + entry->addr_mask);
>   
>       assert(hook_fn);
> +
> +    /* Update local IOVA mapped ranges */
> +    if (entry->perm) {
> +        if (mapped) {
> +            /* Skip since we have already mapped this range */
> +            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
> +                                             mapped->start, mapped->end);
> +            return 0;
> +        }
> +        it_tree_insert(as->iova_tree, entry->iova,
> +                       entry->iova + entry->addr_mask);

I was considering a case, e.g.:

1) map A (iova) to B (pa)
2) invalidate A
3) map A (iova) to C (pa)
4) invalidate A

In this case, we will probably miss a walk here. But I'm not sure this
is allowed by the spec (though I think it is).
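
For concreteness, a rough single-IOVA simulation of the above sequence,
interpreting steps 2) and 4) as PSIs sent while the PTE stays present
(as clarified further down the thread); the names, addresses and the
single boolean standing in for the IOVA tree are all invented for
illustration:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool tracked;            /* is A present in the per-device IOVA tree? */
static uint64_t notified_pa;    /* mapping the notifier (e.g. VFIO) last saw */

/* One PSI-triggered walk result for IOVA A: perm set means the guest
 * page table currently maps A, pa is the guest physical address seen. */
static void walk_one(bool perm, uint64_t pa)
{
    if (perm) {
        if (tracked) {
            printf("skip MAP     A -> 0x%" PRIx64 " (already tracked)\n", pa);
            return;
        }
        tracked = true;
        notified_pa = pa;
        printf("notify MAP   A -> 0x%" PRIx64 "\n", pa);
    } else {
        if (!tracked) {
            printf("skip UNMAP   A (not tracked)\n");
            return;
        }
        tracked = false;
        notified_pa = 0;
        printf("notify UNMAP A\n");
    }
}

int main(void)
{
    /* 1) guest maps A -> B in its page table                             */
    walk_one(true, 0xb000);  /* 2) PSI for A: walk sees A -> B, notified  */
    /* 3) guest rewrites the PTE so that A -> C                           */
    walk_one(true, 0xc000);  /* 4) PSI for A: walk sees A -> C, skipped   */
    printf("notifier still holds A -> 0x%" PRIx64 "\n", notified_pa);
    return 0;
}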

Thanks

> +    } else {
> +        if (!mapped) {
> +            /* Skip since we didn't map this range at all */
> +            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
> +            return 0;
> +        }
> +        it_tree_remove(as->iova_tree, entry->iova,
> +                       entry->iova + entry->addr_mask);
> +    }
> +
>       trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
>                               entry->addr_mask, entry->perm);
> +
>       return hook_fn(entry, private);
>   }
>   
> @@ -2798,6 +2823,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
>           vtd_dev_as->devfn = (uint8_t)devfn;
>           vtd_dev_as->iommu_state = s;
>           vtd_dev_as->context_cache_entry.context_cache_gen = 0;
> +        vtd_dev_as->iova_tree = it_tree_new();
>   
>           /*
>            * Memory region relationships looks like (Address range shows
> @@ -2894,6 +2920,8 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
>                                VTD_PCI_FUNC(as->devfn),
>                                entry.iova, size);
>   
> +    it_tree_remove(as->iova_tree, entry.iova, entry.iova + entry.addr_mask);
> +
>       memory_region_notify_one(n, &entry);
>   }
>   
> diff --git a/hw/i386/trace-events b/hw/i386/trace-events
> index 22d44648af..677f83420d 100644
> --- a/hw/i386/trace-events
> +++ b/hw/i386/trace-events
> @@ -40,6 +40,8 @@ vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
>   vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
>   vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
>   vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
> +vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t start, uint64_t end) "iova 0x%"PRIx64" mask 0x%"PRIx64" start 0x%"PRIx64" end 0x%"PRIx64
> +vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
>   vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
>   vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
>   vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
Peter Xu April 27, 2018, 6:34 a.m. UTC | #2
On Fri, Apr 27, 2018 at 02:07:46PM +0800, Jason Wang wrote:
> 
> 
> On 2018年04月25日 12:51, Peter Xu wrote:
> > For each VTDAddressSpace, now we maintain what IOVA ranges we have
> > mapped and what we have not.  With that information, now we only send
> > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we know
> > we have already mapped the range, meanwhile we don't send UNMAP notifies
> > if we know we never mapped the range at all.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   include/hw/i386/intel_iommu.h |  2 ++
> >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> >   hw/i386/trace-events          |  2 ++
> >   3 files changed, 32 insertions(+)
> > 
> > diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> > index 486e205e79..09a2e94404 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -27,6 +27,7 @@
> >   #include "hw/i386/ioapic.h"
> >   #include "hw/pci/msi.h"
> >   #include "hw/sysbus.h"
> > +#include "qemu/interval-tree.h"
> >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >   #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> >       QLIST_ENTRY(VTDAddressSpace) next;
> >       /* Superset of notifier flags that this address space has */
> >       IOMMUNotifierFlag notifier_flags;
> > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> >   };
> >   struct VTDBus {
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index a19c18b8d4..8f396a5d13 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -768,12 +768,37 @@ typedef struct {
> >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> >                                vtd_page_walk_info *info)
> >   {
> > +    VTDAddressSpace *as = info->as;
> >       vtd_page_walk_hook hook_fn = info->hook_fn;
> >       void *private = info->private;
> > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > +                                   entry->iova + entry->addr_mask);
> >       assert(hook_fn);
> > +
> > +    /* Update local IOVA mapped ranges */
> > +    if (entry->perm) {
> > +        if (mapped) {
> > +            /* Skip since we have already mapped this range */
> > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
> > +                                             mapped->start, mapped->end);
> > +            return 0;
> > +        }
> > +        it_tree_insert(as->iova_tree, entry->iova,
> > +                       entry->iova + entry->addr_mask);
> 
> I was consider a case e.g:
> 
> 1) map A (iova) to B (pa)
> 2) invalidate A

Here, to be more explicit, you mean the guest sends a PSI, not a real
invalidation of the mapping.

> 3) map A (iova) to C (pa)
> 4) invalidate A

Here too.

> 
> In this case, we will probably miss a walk here. But I'm not sure it was
> allowed by the spec (though I think so).

IMHO IOMMU page tables should not be modified by the guest directly.
An entry can be mapped and unmapped, but should not be modified in
place.  I suppose that's why the Linux IOMMU API only provides
iommu_map()/iommu_unmap() and the like, and no iommu_modify().  I don't
know whether there is anything about this in the spec, but AFAIU it can
cause a coherence issue on the device side: after step (1) the device
may already know the mapping, so modifying that mapping without an
UNMAP visible to the device will cause undefined behavior.
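
To make that concrete, a kernel-side sketch (illustrative only, not
code from this series; error handling trimmed) of how changing a
mapping has to be expressed through that API, i.e. as an unmap followed
by a fresh map:

#include <linux/iommu.h>

static int remap_iova(struct iommu_domain *domain, unsigned long iova,
                      phys_addr_t new_pa, size_t size)
{
    /* Tear down the old translation first... */
    iommu_unmap(domain, iova, size);

    /* ...then establish the new one.  The device has to be quiesced
     * around this window, since the IOVA is briefly not mapped at all. */
    return iommu_map(domain, iova, new_pa, size, IOMMU_READ | IOMMU_WRITE);
}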

Thanks,
Tian, Kevin April 27, 2018, 7:02 a.m. UTC | #3
> From: Jason Wang [mailto:jasowang@redhat.com]
> Sent: Friday, April 27, 2018 2:08 PM
> 
> On 2018年04月25日 12:51, Peter Xu wrote:
> > For each VTDAddressSpace, now we maintain what IOVA ranges we have
> > mapped and what we have not.  With that information, now we only
> send
> > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
> know
> > we have already mapped the range, meanwhile we don't send UNMAP
> notifies
> > if we know we never mapped the range at all.
> >
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >   include/hw/i386/intel_iommu.h |  2 ++
> >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> >   hw/i386/trace-events          |  2 ++
> >   3 files changed, 32 insertions(+)
> >
> > diff --git a/include/hw/i386/intel_iommu.h
> b/include/hw/i386/intel_iommu.h
> > index 486e205e79..09a2e94404 100644
> > --- a/include/hw/i386/intel_iommu.h
> > +++ b/include/hw/i386/intel_iommu.h
> > @@ -27,6 +27,7 @@
> >   #include "hw/i386/ioapic.h"
> >   #include "hw/pci/msi.h"
> >   #include "hw/sysbus.h"
> > +#include "qemu/interval-tree.h"
> >
> >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> >   #define INTEL_IOMMU_DEVICE(obj) \
> > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> >       QLIST_ENTRY(VTDAddressSpace) next;
> >       /* Superset of notifier flags that this address space has */
> >       IOMMUNotifierFlag notifier_flags;
> > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> >   };
> >
> >   struct VTDBus {
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index a19c18b8d4..8f396a5d13 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -768,12 +768,37 @@ typedef struct {
> >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> >                                vtd_page_walk_info *info)
> >   {
> > +    VTDAddressSpace *as = info->as;
> >       vtd_page_walk_hook hook_fn = info->hook_fn;
> >       void *private = info->private;
> > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > +                                   entry->iova + entry->addr_mask);
> >
> >       assert(hook_fn);
> > +
> > +    /* Update local IOVA mapped ranges */
> > +    if (entry->perm) {
> > +        if (mapped) {
> > +            /* Skip since we have already mapped this range */
> > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> >addr_mask,
> > +                                             mapped->start, mapped->end);
> > +            return 0;
> > +        }
> > +        it_tree_insert(as->iova_tree, entry->iova,
> > +                       entry->iova + entry->addr_mask);
> 
> I was consider a case e.g:
> 
> 1) map A (iova) to B (pa)
> 2) invalidate A
> 3) map A (iova) to C (pa)
> 4) invalidate A
> 
> In this case, we will probably miss a walk here. But I'm not sure it was
> allowed by the spec (though I think so).
> 

At a glance I thought it was wrong, but then changed my mind after
thinking it over. As long as the device driver can quiesce the device
so that it does not access A (iova) within the above window, the above
sequence has no problem, since any stale mapping (A->B) added before
step 4) won't be used and will get flushed after step 4). Following
that reasoning, the 1st invalidation can actually be skipped:

1) map A (iova) to B (pa)
2) driver programs device to use A
3) driver programs device to not use A
4) map A (iova) to C (pa)
	A->B may be still valid in IOTLB
5) invalidate A
6) driver programs device to use A

Of course, the above doesn't make for a sane IOMMU API framework,
just as Peter pointed out. But from a hardware p.o.v. it looks fine.

Thanks
Kevin
Peter Xu April 27, 2018, 7:28 a.m. UTC | #4
On Fri, Apr 27, 2018 at 07:02:14AM +0000, Tian, Kevin wrote:
> > From: Jason Wang [mailto:jasowang@redhat.com]
> > Sent: Friday, April 27, 2018 2:08 PM
> > 
> > On 2018年04月25日 12:51, Peter Xu wrote:
> > > For each VTDAddressSpace, now we maintain what IOVA ranges we have
> > > mapped and what we have not.  With that information, now we only
> > send
> > > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if we
> > know
> > > we have already mapped the range, meanwhile we don't send UNMAP
> > notifies
> > > if we know we never mapped the range at all.
> > >
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >   include/hw/i386/intel_iommu.h |  2 ++
> > >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> > >   hw/i386/trace-events          |  2 ++
> > >   3 files changed, 32 insertions(+)
> > >
> > > diff --git a/include/hw/i386/intel_iommu.h
> > b/include/hw/i386/intel_iommu.h
> > > index 486e205e79..09a2e94404 100644
> > > --- a/include/hw/i386/intel_iommu.h
> > > +++ b/include/hw/i386/intel_iommu.h
> > > @@ -27,6 +27,7 @@
> > >   #include "hw/i386/ioapic.h"
> > >   #include "hw/pci/msi.h"
> > >   #include "hw/sysbus.h"
> > > +#include "qemu/interval-tree.h"
> > >
> > >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> > >   #define INTEL_IOMMU_DEVICE(obj) \
> > > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> > >       QLIST_ENTRY(VTDAddressSpace) next;
> > >       /* Superset of notifier flags that this address space has */
> > >       IOMMUNotifierFlag notifier_flags;
> > > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> > >   };
> > >
> > >   struct VTDBus {
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index a19c18b8d4..8f396a5d13 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -768,12 +768,37 @@ typedef struct {
> > >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> > >                                vtd_page_walk_info *info)
> > >   {
> > > +    VTDAddressSpace *as = info->as;
> > >       vtd_page_walk_hook hook_fn = info->hook_fn;
> > >       void *private = info->private;
> > > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > > +                                   entry->iova + entry->addr_mask);
> > >
> > >       assert(hook_fn);
> > > +
> > > +    /* Update local IOVA mapped ranges */
> > > +    if (entry->perm) {
> > > +        if (mapped) {
> > > +            /* Skip since we have already mapped this range */
> > > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > >addr_mask,
> > > +                                             mapped->start, mapped->end);
> > > +            return 0;
> > > +        }
> > > +        it_tree_insert(as->iova_tree, entry->iova,
> > > +                       entry->iova + entry->addr_mask);
> > 
> > I was consider a case e.g:
> > 
> > 1) map A (iova) to B (pa)
> > 2) invalidate A
> > 3) map A (iova) to C (pa)
> > 4) invalidate A
> > 
> > In this case, we will probably miss a walk here. But I'm not sure it was
> > allowed by the spec (though I think so).
> > 

Hi, Kevin,

Thanks for joining the discussion.

> 
> I thought it was wrong in a glimpse, but then changed my mind after
> another thinking. As long as device driver can quiescent the device
> to not access A (iova) within above window, then above sequence
> has no problem since any stale mappings (A->B) added before step 4)
> won't be used and then will get flushed after step 4). Regarding to
> that actually the 1st invalidation can be skipped:
> 
> 1) map A (iova) to B (pa)
> 2) driver programs device to use A
> 3) driver programs device to not use A
> 4) map A (iova) to C (pa)
> 	A->B may be still valid in IOTLB
> 5) invalidate A
> 6) driver programs device to use A

Note that IMHO this is a bit different from Jason's example, and it'll
be fine.  The current code should work well with this scenario, since
the emulation code will not be aware of the mapping of A until step
(5).  Then we'll have the correct mapping.

While for Jason's example it's exactly the extra PSI that might cause
stale mappings (though again I think it's still problematic...).

Actually I think I can fix up the code even if Jason's case happens, by
unmapping the range first and then remapping it:

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 31e9b52452..2a9584f9d8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -778,13 +778,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
     /* Update local IOVA mapped ranges */
     if (entry->perm) {
         if (mapped) {
-            /* Skip since we have already mapped this range */
-            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
-                                             mapped->start, mapped->end);
-            return 0;
+            int ret;
+            /* Cache the write permission */
+            IOMMUAccessFlags flags = entry->perm;
+
+            /* UNMAP the old first then remap.  No need to touch IOVA tree */
+            entry->perm = IOMMU_NONE;
+            ret = hook_fn(entry, private);
+            if (ret) {
+                return ret;
+            }
+            entry->perm = flags;
+        } else {
+            it_tree_insert(as->iova_tree, entry->iova,
+                           entry->iova + entry->addr_mask);
         }
-        it_tree_insert(as->iova_tree, entry->iova,
-                       entry->iova + entry->addr_mask);
     } else {
         if (!mapped) {
             /* Skip since we didn't map this range at all */

If we really think it necessary, I can squash this in, though it is a
bit ugly.  But I first want to confirm whether this is something we
want...

Thanks,
Tian, Kevin April 27, 2018, 7:44 a.m. UTC | #5
> From: Peter Xu [mailto:peterx@redhat.com]
> Sent: Friday, April 27, 2018 3:28 PM
> 
> On Fri, Apr 27, 2018 at 07:02:14AM +0000, Tian, Kevin wrote:
> > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > Sent: Friday, April 27, 2018 2:08 PM
> > >
> > > On 2018年04月25日 12:51, Peter Xu wrote:
> > > > For each VTDAddressSpace, now we maintain what IOVA ranges we
> have
> > > > mapped and what we have not.  With that information, now we only
> > > send
> > > > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if
> we
> > > know
> > > > we have already mapped the range, meanwhile we don't send
> UNMAP
> > > notifies
> > > > if we know we never mapped the range at all.
> > > >
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >   include/hw/i386/intel_iommu.h |  2 ++
> > > >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> > > >   hw/i386/trace-events          |  2 ++
> > > >   3 files changed, 32 insertions(+)
> > > >
> > > > diff --git a/include/hw/i386/intel_iommu.h
> > > b/include/hw/i386/intel_iommu.h
> > > > index 486e205e79..09a2e94404 100644
> > > > --- a/include/hw/i386/intel_iommu.h
> > > > +++ b/include/hw/i386/intel_iommu.h
> > > > @@ -27,6 +27,7 @@
> > > >   #include "hw/i386/ioapic.h"
> > > >   #include "hw/pci/msi.h"
> > > >   #include "hw/sysbus.h"
> > > > +#include "qemu/interval-tree.h"
> > > >
> > > >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> > > >   #define INTEL_IOMMU_DEVICE(obj) \
> > > > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> > > >       QLIST_ENTRY(VTDAddressSpace) next;
> > > >       /* Superset of notifier flags that this address space has */
> > > >       IOMMUNotifierFlag notifier_flags;
> > > > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> > > >   };
> > > >
> > > >   struct VTDBus {
> > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > index a19c18b8d4..8f396a5d13 100644
> > > > --- a/hw/i386/intel_iommu.c
> > > > +++ b/hw/i386/intel_iommu.c
> > > > @@ -768,12 +768,37 @@ typedef struct {
> > > >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> > > >                                vtd_page_walk_info *info)
> > > >   {
> > > > +    VTDAddressSpace *as = info->as;
> > > >       vtd_page_walk_hook hook_fn = info->hook_fn;
> > > >       void *private = info->private;
> > > > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > > > +                                   entry->iova + entry->addr_mask);
> > > >
> > > >       assert(hook_fn);
> > > > +
> > > > +    /* Update local IOVA mapped ranges */
> > > > +    if (entry->perm) {
> > > > +        if (mapped) {
> > > > +            /* Skip since we have already mapped this range */
> > > > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > > >addr_mask,
> > > > +                                             mapped->start, mapped->end);
> > > > +            return 0;
> > > > +        }
> > > > +        it_tree_insert(as->iova_tree, entry->iova,
> > > > +                       entry->iova + entry->addr_mask);
> > >
> > > I was consider a case e.g:
> > >
> > > 1) map A (iova) to B (pa)
> > > 2) invalidate A
> > > 3) map A (iova) to C (pa)
> > > 4) invalidate A
> > >
> > > In this case, we will probably miss a walk here. But I'm not sure it was
> > > allowed by the spec (though I think so).
> > >
> 
> Hi, Kevin,
> 
> Thanks for joining the discussion.
> 
> >
> > I thought it was wrong in a glimpse, but then changed my mind after
> > another thinking. As long as device driver can quiescent the device
> > to not access A (iova) within above window, then above sequence
> > has no problem since any stale mappings (A->B) added before step 4)
> > won't be used and then will get flushed after step 4). Regarding to
> > that actually the 1st invalidation can be skipped:
> >
> > 1) map A (iova) to B (pa)
> > 2) driver programs device to use A
> > 3) driver programs device to not use A
> > 4) map A (iova) to C (pa)
> > 	A->B may be still valid in IOTLB
> > 5) invalidate A
> > 6) driver programs device to use A
> 
> Note that IMHO this is a bit different from Jason's example, and it'll
> be fine.  Current code should work well with this scenario since the
> emulation code will not aware of the map A until step (5).  Then we'll
> have the correct mapping.

You are right. We still need the extra PSI, otherwise the 1st mapping
is not usable. So back to Jason's example.

> 
> While for Jason's example it's exactly the extra PSI that might cause
> stale mappings (though again I think it's still problematic...).

It's problematic on the software side (e.g. that way the IOMMU core
relies on device drivers, which conflicts with the value of using an
IOMMU), but it is OK from a hardware p.o.v.  Btw, the extra PSI itself
doesn't cause the stale mapping; it is device activity after that PSI
which may cause it.

> 
> Actually I think I can just fix up the code even if Jason's case
> happens by unmapping that first then remap:
> 
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index 31e9b52452..2a9584f9d8 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -778,13 +778,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry
> *entry, int level,
>      /* Update local IOVA mapped ranges */
>      if (entry->perm) {
>          if (mapped) {
> -            /* Skip since we have already mapped this range */
> -            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> >addr_mask,
> -                                             mapped->start, mapped->end);
> -            return 0;
> +            int ret;
> +            /* Cache the write permission */
> +            IOMMUAccessFlags flags = entry->perm;
> +
> +            /* UNMAP the old first then remap.  No need to touch IOVA tree */
> +            entry->perm = IOMMU_NONE;
> +            ret = hook_fn(entry, private);
> +            if (ret) {
> +                return ret;
> +            }
> +            entry->perm = flags;
> +        } else {
> +            it_tree_insert(as->iova_tree, entry->iova,
> +                           entry->iova + entry->addr_mask);
>          }
> -        it_tree_insert(as->iova_tree, entry->iova,
> -                       entry->iova + entry->addr_mask);
>      } else {
>          if (!mapped) {
>              /* Skip since we didn't map this range at all */
> 
> If we really think it necessary, I can squash this in, though this is
> a bit ugly.  But I just want to confirm whether this would be anything
> we want...
> 

I didn't look into your actual code yet. If others think the above
change is OK, then it's definitely good, as we conform to hardware
behavior here. Otherwise, if there is a way to detect such unusual
usage and take some action (say, kill the VM), that's also fine, since
the user then knows they are running a bad OS which is not supported
by QEMU. It's just not good if such a situation is left unhandled,
leading to undefined behavior whose cause nobody can figure out
without hard debugging. :-)

Thanks
Kevin
Peter Xu April 27, 2018, 9:55 a.m. UTC | #6
On Fri, Apr 27, 2018 at 07:44:07AM +0000, Tian, Kevin wrote:
> > From: Peter Xu [mailto:peterx@redhat.com]
> > Sent: Friday, April 27, 2018 3:28 PM
> > 
> > On Fri, Apr 27, 2018 at 07:02:14AM +0000, Tian, Kevin wrote:
> > > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > > Sent: Friday, April 27, 2018 2:08 PM
> > > >
> > > > On 2018年04月25日 12:51, Peter Xu wrote:
> > > > > For each VTDAddressSpace, now we maintain what IOVA ranges we
> > have
> > > > > mapped and what we have not.  With that information, now we only
> > > > send
> > > > > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if
> > we
> > > > know
> > > > > we have already mapped the range, meanwhile we don't send
> > UNMAP
> > > > notifies
> > > > > if we know we never mapped the range at all.
> > > > >
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >   include/hw/i386/intel_iommu.h |  2 ++
> > > > >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> > > > >   hw/i386/trace-events          |  2 ++
> > > > >   3 files changed, 32 insertions(+)
> > > > >
> > > > > diff --git a/include/hw/i386/intel_iommu.h
> > > > b/include/hw/i386/intel_iommu.h
> > > > > index 486e205e79..09a2e94404 100644
> > > > > --- a/include/hw/i386/intel_iommu.h
> > > > > +++ b/include/hw/i386/intel_iommu.h
> > > > > @@ -27,6 +27,7 @@
> > > > >   #include "hw/i386/ioapic.h"
> > > > >   #include "hw/pci/msi.h"
> > > > >   #include "hw/sysbus.h"
> > > > > +#include "qemu/interval-tree.h"
> > > > >
> > > > >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> > > > >   #define INTEL_IOMMU_DEVICE(obj) \
> > > > > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> > > > >       QLIST_ENTRY(VTDAddressSpace) next;
> > > > >       /* Superset of notifier flags that this address space has */
> > > > >       IOMMUNotifierFlag notifier_flags;
> > > > > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> > > > >   };
> > > > >
> > > > >   struct VTDBus {
> > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > index a19c18b8d4..8f396a5d13 100644
> > > > > --- a/hw/i386/intel_iommu.c
> > > > > +++ b/hw/i386/intel_iommu.c
> > > > > @@ -768,12 +768,37 @@ typedef struct {
> > > > >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> > > > >                                vtd_page_walk_info *info)
> > > > >   {
> > > > > +    VTDAddressSpace *as = info->as;
> > > > >       vtd_page_walk_hook hook_fn = info->hook_fn;
> > > > >       void *private = info->private;
> > > > > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > > > > +                                   entry->iova + entry->addr_mask);
> > > > >
> > > > >       assert(hook_fn);
> > > > > +
> > > > > +    /* Update local IOVA mapped ranges */
> > > > > +    if (entry->perm) {
> > > > > +        if (mapped) {
> > > > > +            /* Skip since we have already mapped this range */
> > > > > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > > > >addr_mask,
> > > > > +                                             mapped->start, mapped->end);
> > > > > +            return 0;
> > > > > +        }
> > > > > +        it_tree_insert(as->iova_tree, entry->iova,
> > > > > +                       entry->iova + entry->addr_mask);
> > > >
> > > > I was consider a case e.g:
> > > >
> > > > 1) map A (iova) to B (pa)
> > > > 2) invalidate A
> > > > 3) map A (iova) to C (pa)
> > > > 4) invalidate A
> > > >
> > > > In this case, we will probably miss a walk here. But I'm not sure it was
> > > > allowed by the spec (though I think so).
> > > >
> > 
> > Hi, Kevin,
> > 
> > Thanks for joining the discussion.
> > 
> > >
> > > I thought it was wrong in a glimpse, but then changed my mind after
> > > another thinking. As long as device driver can quiescent the device
> > > to not access A (iova) within above window, then above sequence
> > > has no problem since any stale mappings (A->B) added before step 4)
> > > won't be used and then will get flushed after step 4). Regarding to
> > > that actually the 1st invalidation can be skipped:
> > >
> > > 1) map A (iova) to B (pa)
> > > 2) driver programs device to use A
> > > 3) driver programs device to not use A
> > > 4) map A (iova) to C (pa)
> > > 	A->B may be still valid in IOTLB
> > > 5) invalidate A
> > > 6) driver programs device to use A
> > 
> > Note that IMHO this is a bit different from Jason's example, and it'll
> > be fine.  Current code should work well with this scenario since the
> > emulation code will not aware of the map A until step (5).  Then we'll
> > have the correct mapping.
> 
> you are right. we still need the extra PSI otherwise the 1st mapping
> is problematic for use. So back to Jason's example.
> 
> > 
> > While for Jason's example it's exactly the extra PSI that might cause
> > stale mappings (though again I think it's still problematic...).
> 
> problematic in software side (e.g. that way IOMMU core relies on
> device drivers which conflict with the value of using IOMMU) but
> it is OK from hardware p.o.v. btw the extra PSI itself doesn't cause
> stale mapping. Instead it is device activity after that PSI may cause it.
> 
> > 
> > Actually I think I can just fix up the code even if Jason's case
> > happens by unmapping that first then remap:
> > 
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 31e9b52452..2a9584f9d8 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -778,13 +778,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry
> > *entry, int level,
> >      /* Update local IOVA mapped ranges */
> >      if (entry->perm) {
> >          if (mapped) {
> > -            /* Skip since we have already mapped this range */
> > -            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > >addr_mask,
> > -                                             mapped->start, mapped->end);
> > -            return 0;
> > +            int ret;
> > +            /* Cache the write permission */
> > +            IOMMUAccessFlags flags = entry->perm;
> > +
> > +            /* UNMAP the old first then remap.  No need to touch IOVA tree */
> > +            entry->perm = IOMMU_NONE;
> > +            ret = hook_fn(entry, private);
> > +            if (ret) {
> > +                return ret;
> > +            }
> > +            entry->perm = flags;
> > +        } else {
> > +            it_tree_insert(as->iova_tree, entry->iova,
> > +                           entry->iova + entry->addr_mask);
> >          }
> > -        it_tree_insert(as->iova_tree, entry->iova,
> > -                       entry->iova + entry->addr_mask);
> >      } else {
> >          if (!mapped) {
> >              /* Skip since we didn't map this range at all */
> > 
> > If we really think it necessary, I can squash this in, though this is
> > a bit ugly.  But I just want to confirm whether this would be anything
> > we want...
> > 
> 
> I didn’t look into your actual code yet. If others think above
> change is OK then it's definitely good as we conform to hardware
> behavior here. Otherwise if there is a way to detect such unusual 
> usage and then adopt some action (say kill the VM), it's also fine 
> since user knows he runs a bad OS which is not supported by 
> Qemu. It's just not good if such situation is not handled, which 
> leads to some undefined behavior which nobody knows the reason 
> w/o hard debug into. :-)

Yeah, then let me do this. :)

Jason, would you be good with the above change squashed?

Thanks,
Peter Xu April 27, 2018, 11:40 a.m. UTC | #7
On Fri, Apr 27, 2018 at 05:55:27PM +0800, Peter Xu wrote:
> On Fri, Apr 27, 2018 at 07:44:07AM +0000, Tian, Kevin wrote:
> > > From: Peter Xu [mailto:peterx@redhat.com]
> > > Sent: Friday, April 27, 2018 3:28 PM
> > > 
> > > On Fri, Apr 27, 2018 at 07:02:14AM +0000, Tian, Kevin wrote:
> > > > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > > > Sent: Friday, April 27, 2018 2:08 PM
> > > > >
> > > > > On 2018年04月25日 12:51, Peter Xu wrote:
> > > > > > For each VTDAddressSpace, now we maintain what IOVA ranges we
> > > have
> > > > > > mapped and what we have not.  With that information, now we only
> > > > > send
> > > > > > MAP or UNMAP when necessary.  Say, we don't send MAP notifies if
> > > we
> > > > > know
> > > > > > we have already mapped the range, meanwhile we don't send
> > > UNMAP
> > > > > notifies
> > > > > > if we know we never mapped the range at all.
> > > > > >
> > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > ---
> > > > > >   include/hw/i386/intel_iommu.h |  2 ++
> > > > > >   hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
> > > > > >   hw/i386/trace-events          |  2 ++
> > > > > >   3 files changed, 32 insertions(+)
> > > > > >
> > > > > > diff --git a/include/hw/i386/intel_iommu.h
> > > > > b/include/hw/i386/intel_iommu.h
> > > > > > index 486e205e79..09a2e94404 100644
> > > > > > --- a/include/hw/i386/intel_iommu.h
> > > > > > +++ b/include/hw/i386/intel_iommu.h
> > > > > > @@ -27,6 +27,7 @@
> > > > > >   #include "hw/i386/ioapic.h"
> > > > > >   #include "hw/pci/msi.h"
> > > > > >   #include "hw/sysbus.h"
> > > > > > +#include "qemu/interval-tree.h"
> > > > > >
> > > > > >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> > > > > >   #define INTEL_IOMMU_DEVICE(obj) \
> > > > > > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> > > > > >       QLIST_ENTRY(VTDAddressSpace) next;
> > > > > >       /* Superset of notifier flags that this address space has */
> > > > > >       IOMMUNotifierFlag notifier_flags;
> > > > > > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> > > > > >   };
> > > > > >
> > > > > >   struct VTDBus {
> > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > > index a19c18b8d4..8f396a5d13 100644
> > > > > > --- a/hw/i386/intel_iommu.c
> > > > > > +++ b/hw/i386/intel_iommu.c
> > > > > > @@ -768,12 +768,37 @@ typedef struct {
> > > > > >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> > > > > >                                vtd_page_walk_info *info)
> > > > > >   {
> > > > > > +    VTDAddressSpace *as = info->as;
> > > > > >       vtd_page_walk_hook hook_fn = info->hook_fn;
> > > > > >       void *private = info->private;
> > > > > > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > > > > > +                                   entry->iova + entry->addr_mask);
> > > > > >
> > > > > >       assert(hook_fn);
> > > > > > +
> > > > > > +    /* Update local IOVA mapped ranges */
> > > > > > +    if (entry->perm) {
> > > > > > +        if (mapped) {
> > > > > > +            /* Skip since we have already mapped this range */
> > > > > > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > > > > >addr_mask,
> > > > > > +                                             mapped->start, mapped->end);
> > > > > > +            return 0;
> > > > > > +        }
> > > > > > +        it_tree_insert(as->iova_tree, entry->iova,
> > > > > > +                       entry->iova + entry->addr_mask);
> > > > >
> > > > > I was consider a case e.g:
> > > > >
> > > > > 1) map A (iova) to B (pa)
> > > > > 2) invalidate A
> > > > > 3) map A (iova) to C (pa)
> > > > > 4) invalidate A
> > > > >
> > > > > In this case, we will probably miss a walk here. But I'm not sure it was
> > > > > allowed by the spec (though I think so).
> > > > >
> > > 
> > > Hi, Kevin,
> > > 
> > > Thanks for joining the discussion.
> > > 
> > > >
> > > > I thought it was wrong in a glimpse, but then changed my mind after
> > > > another thinking. As long as device driver can quiescent the device
> > > > to not access A (iova) within above window, then above sequence
> > > > has no problem since any stale mappings (A->B) added before step 4)
> > > > won't be used and then will get flushed after step 4). Regarding to
> > > > that actually the 1st invalidation can be skipped:
> > > >
> > > > 1) map A (iova) to B (pa)
> > > > 2) driver programs device to use A
> > > > 3) driver programs device to not use A
> > > > 4) map A (iova) to C (pa)
> > > > 	A->B may be still valid in IOTLB
> > > > 5) invalidate A
> > > > 6) driver programs device to use A
> > > 
> > > Note that IMHO this is a bit different from Jason's example, and it'll
> > > be fine.  Current code should work well with this scenario since the
> > > emulation code will not aware of the map A until step (5).  Then we'll
> > > have the correct mapping.
> > 
> > you are right. we still need the extra PSI otherwise the 1st mapping
> > is problematic for use. So back to Jason's example.
> > 
> > > 
> > > While for Jason's example it's exactly the extra PSI that might cause
> > > stale mappings (though again I think it's still problematic...).
> > 
> > problematic in software side (e.g. that way IOMMU core relies on
> > device drivers which conflict with the value of using IOMMU) but
> > it is OK from hardware p.o.v. btw the extra PSI itself doesn't cause
> > stale mapping. Instead it is device activity after that PSI may cause it.
> > 
> > > 
> > > Actually I think I can just fix up the code even if Jason's case
> > > happens by unmapping that first then remap:
> > > 
> > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > index 31e9b52452..2a9584f9d8 100644
> > > --- a/hw/i386/intel_iommu.c
> > > +++ b/hw/i386/intel_iommu.c
> > > @@ -778,13 +778,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry
> > > *entry, int level,
> > >      /* Update local IOVA mapped ranges */
> > >      if (entry->perm) {
> > >          if (mapped) {
> > > -            /* Skip since we have already mapped this range */
> > > -            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > > >addr_mask,
> > > -                                             mapped->start, mapped->end);
> > > -            return 0;
> > > +            int ret;
> > > +            /* Cache the write permission */
> > > +            IOMMUAccessFlags flags = entry->perm;
> > > +
> > > +            /* UNMAP the old first then remap.  No need to touch IOVA tree */
> > > +            entry->perm = IOMMU_NONE;
> > > +            ret = hook_fn(entry, private);
> > > +            if (ret) {
> > > +                return ret;
> > > +            }
> > > +            entry->perm = flags;
> > > +        } else {
> > > +            it_tree_insert(as->iova_tree, entry->iova,
> > > +                           entry->iova + entry->addr_mask);
> > >          }
> > > -        it_tree_insert(as->iova_tree, entry->iova,
> > > -                       entry->iova + entry->addr_mask);
> > >      } else {
> > >          if (!mapped) {
> > >              /* Skip since we didn't map this range at all */
> > > 
> > > If we really think it necessary, I can squash this in, though this is
> > > a bit ugly.  But I just want to confirm whether this would be anything
> > > we want...
> > > 
> > 
> > I didn’t look into your actual code yet. If others think above
> > change is OK then it's definitely good as we conform to hardware
> > behavior here. Otherwise if there is a way to detect such unusual 
> > usage and then adopt some action (say kill the VM), it's also fine 
> > since user knows he runs a bad OS which is not supported by 
> > Qemu. It's just not good if such situation is not handled, which 
> > leads to some undefined behavior which nobody knows the reason 
> > w/o hard debug into. :-)
> 
> Yeah, then let me do this. :)
> 
> Jason, would you be good with above change squashed?

Self NAK on this...

More than half of the whole series tries to solve the single problem
that we were unmapping pages that were already mapped, which proved to
be wrong.  Now if we squash the change we will do the same wrong
thing, so there will still be a very small window in which the
remapped page is missing from the device's POV.

Now to solve this I suppose we'll need to cache every translation, so
that we can know whether a mapping has changed and only remap when it
really has.  But I'm afraid that can be a big amount of data for
nested guests.  For a common 4G L2 guest, I think the worst case
(e.g., no huge pages at all, no contiguous ranges) is 4G/4K = 1M
entries in that tree.
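
For concreteness, a rough sketch of what that per-page caching could
look like (nothing below is from the series; the VTDCachedPage name and
the helper are invented), which also shows where the memory cost comes
from:

#include <stdbool.h>
#include <stdint.h>

typedef struct {
    uint64_t translated_addr;   /* GPA last notified for this IOVA page */
    bool     mapped;
} VTDCachedPage;

/*
 * Decide whether a walk result for one 4K page must be forwarded.
 * Unlike the range tree, a changed GPA is detected and handled as
 * UNMAP followed by MAP, at the cost of one record per page: about
 * one million records for a fully mapped 4G guest, as noted above.
 */
static bool page_needs_notify(VTDCachedPage *page, bool perm, uint64_t gpa,
                              bool *need_unmap_first)
{
    *need_unmap_first = false;

    if (perm) {
        if (page->mapped && page->translated_addr == gpa) {
            return false;               /* identical mapping: skip */
        }
        if (page->mapped) {
            *need_unmap_first = true;   /* GPA changed: unmap, then remap */
        }
        page->mapped = true;
        page->translated_addr = gpa;
        return true;
    }

    if (!page->mapped) {
        return false;                   /* never mapped: skip the UNMAP */
    }
    page->mapped = false;
    return true;
}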

Is it really worth it to solve this possibly-buggy-guest-OS problem
with such an overhead?  I don't know...

I'm not sure whether it's acceptable to simply put this issue aside.
We should note that normal OSes should not do this, and if one does,
IMHO that already proves it is a buggy OS (so even if from the
hardware POV we allow this, from the software POV it is still
problematic); it will then have problems for sure, but only within the
VM itself, and it won't affect other VMs or the host.  That still
sounds reasonable to me so far.

Of course I'm always willing to listen to more advice on this.

Thanks,
Tian, Kevin April 27, 2018, 11:37 p.m. UTC | #8
> From: Peter Xu [mailto:peterx@redhat.com]
> Sent: Friday, April 27, 2018 7:40 PM
> 
> On Fri, Apr 27, 2018 at 05:55:27PM +0800, Peter Xu wrote:
> > On Fri, Apr 27, 2018 at 07:44:07AM +0000, Tian, Kevin wrote:
> > > > From: Peter Xu [mailto:peterx@redhat.com]
> > > > Sent: Friday, April 27, 2018 3:28 PM
> > > >
> > > > On Fri, Apr 27, 2018 at 07:02:14AM +0000, Tian, Kevin wrote:
> > > > > > From: Jason Wang [mailto:jasowang@redhat.com]
> > > > > > Sent: Friday, April 27, 2018 2:08 PM
> > > > > >
> > > > > > On 2018年04月25日 12:51, Peter Xu wrote:
> > > > > > > For each VTDAddressSpace, now we maintain what IOVA ranges
> we
> > > > have
> > > > > > > mapped and what we have not.  With that information, now we
> only
> > > > > > send
> > > > > > > MAP or UNMAP when necessary.  Say, we don't send MAP
> notifies if
> > > > we
> > > > > > know
> > > > > > > we have already mapped the range, meanwhile we don't send
> > > > UNMAP
> > > > > > notifies
> > > > > > > if we know we never mapped the range at all.
> > > > > > >
> > > > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > > > ---
> > > > > > >   include/hw/i386/intel_iommu.h |  2 ++
> > > > > > >   hw/i386/intel_iommu.c         | 28
> ++++++++++++++++++++++++++++
> > > > > > >   hw/i386/trace-events          |  2 ++
> > > > > > >   3 files changed, 32 insertions(+)
> > > > > > >
> > > > > > > diff --git a/include/hw/i386/intel_iommu.h
> > > > > > b/include/hw/i386/intel_iommu.h
> > > > > > > index 486e205e79..09a2e94404 100644
> > > > > > > --- a/include/hw/i386/intel_iommu.h
> > > > > > > +++ b/include/hw/i386/intel_iommu.h
> > > > > > > @@ -27,6 +27,7 @@
> > > > > > >   #include "hw/i386/ioapic.h"
> > > > > > >   #include "hw/pci/msi.h"
> > > > > > >   #include "hw/sysbus.h"
> > > > > > > +#include "qemu/interval-tree.h"
> > > > > > >
> > > > > > >   #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
> > > > > > >   #define INTEL_IOMMU_DEVICE(obj) \
> > > > > > > @@ -95,6 +96,7 @@ struct VTDAddressSpace {
> > > > > > >       QLIST_ENTRY(VTDAddressSpace) next;
> > > > > > >       /* Superset of notifier flags that this address space has */
> > > > > > >       IOMMUNotifierFlag notifier_flags;
> > > > > > > +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
> > > > > > >   };
> > > > > > >
> > > > > > >   struct VTDBus {
> > > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > > > > index a19c18b8d4..8f396a5d13 100644
> > > > > > > --- a/hw/i386/intel_iommu.c
> > > > > > > +++ b/hw/i386/intel_iommu.c
> > > > > > > @@ -768,12 +768,37 @@ typedef struct {
> > > > > > >   static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
> > > > > > >                                vtd_page_walk_info *info)
> > > > > > >   {
> > > > > > > +    VTDAddressSpace *as = info->as;
> > > > > > >       vtd_page_walk_hook hook_fn = info->hook_fn;
> > > > > > >       void *private = info->private;
> > > > > > > +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
> > > > > > > +                                   entry->iova + entry->addr_mask);
> > > > > > >
> > > > > > >       assert(hook_fn);
> > > > > > > +
> > > > > > > +    /* Update local IOVA mapped ranges */
> > > > > > > +    if (entry->perm) {
> > > > > > > +        if (mapped) {
> > > > > > > +            /* Skip since we have already mapped this range */
> > > > > > > +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > > > > > >addr_mask,
> > > > > > > +                                             mapped->start, mapped->end);
> > > > > > > +            return 0;
> > > > > > > +        }
> > > > > > > +        it_tree_insert(as->iova_tree, entry->iova,
> > > > > > > +                       entry->iova + entry->addr_mask);
> > > > > >
> > > > > > I was consider a case e.g:
> > > > > >
> > > > > > 1) map A (iova) to B (pa)
> > > > > > 2) invalidate A
> > > > > > 3) map A (iova) to C (pa)
> > > > > > 4) invalidate A
> > > > > >
> > > > > > In this case, we will probably miss a walk here. But I'm not sure it
> was
> > > > > > allowed by the spec (though I think so).
> > > > > >
> > > >
> > > > Hi, Kevin,
> > > >
> > > > Thanks for joining the discussion.
> > > >
> > > > >
> > > > > I thought it was wrong in a glimpse, but then changed my mind after
> > > > > another thinking. As long as device driver can quiescent the device
> > > > > to not access A (iova) within above window, then above sequence
> > > > > has no problem since any stale mappings (A->B) added before step 4)
> > > > > won't be used and then will get flushed after step 4). Regarding to
> > > > > that actually the 1st invalidation can be skipped:
> > > > >
> > > > > 1) map A (iova) to B (pa)
> > > > > 2) driver programs device to use A
> > > > > 3) driver programs device to not use A
> > > > > 4) map A (iova) to C (pa)
> > > > > 	A->B may be still valid in IOTLB
> > > > > 5) invalidate A
> > > > > 6) driver programs device to use A
> > > >
> > > > Note that IMHO this is a bit different from Jason's example, and it'll
> > > > be fine.  Current code should work well with this scenario since the
> > > > emulation code will not aware of the map A until step (5).  Then we'll
> > > > have the correct mapping.
> > >
> > > you are right. we still need the extra PSI otherwise the 1st mapping
> > > is problematic for use. So back to Jason's example.
> > >
> > > >
> > > > While for Jason's example it's exactly the extra PSI that might cause
> > > > stale mappings (though again I think it's still problematic...).
> > >
> > > problematic in software side (e.g. that way IOMMU core relies on
> > > device drivers which conflict with the value of using IOMMU) but
> > > it is OK from hardware p.o.v. btw the extra PSI itself doesn't cause
> > > stale mapping. Instead it is device activity after that PSI may cause it.
> > >
> > > >
> > > > Actually I think I can just fix up the code even if Jason's case
> > > > happens by unmapping that first then remap:
> > > >
> > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > > > index 31e9b52452..2a9584f9d8 100644
> > > > --- a/hw/i386/intel_iommu.c
> > > > +++ b/hw/i386/intel_iommu.c
> > > > @@ -778,13 +778,21 @@ static int
> vtd_page_walk_one(IOMMUTLBEntry
> > > > *entry, int level,
> > > >      /* Update local IOVA mapped ranges */
> > > >      if (entry->perm) {
> > > >          if (mapped) {
> > > > -            /* Skip since we have already mapped this range */
> > > > -            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
> > > > >addr_mask,
> > > > -                                             mapped->start, mapped->end);
> > > > -            return 0;
> > > > +            int ret;
> > > > +            /* Cache the write permission */
> > > > +            IOMMUAccessFlags flags = entry->perm;
> > > > +
> > > > +            /* UNMAP the old first then remap.  No need to touch IOVA
> tree */
> > > > +            entry->perm = IOMMU_NONE;
> > > > +            ret = hook_fn(entry, private);
> > > > +            if (ret) {
> > > > +                return ret;
> > > > +            }
> > > > +            entry->perm = flags;
> > > > +        } else {
> > > > +            it_tree_insert(as->iova_tree, entry->iova,
> > > > +                           entry->iova + entry->addr_mask);
> > > >          }
> > > > -        it_tree_insert(as->iova_tree, entry->iova,
> > > > -                       entry->iova + entry->addr_mask);
> > > >      } else {
> > > >          if (!mapped) {
> > > >              /* Skip since we didn't map this range at all */
> > > >
> > > > If we really think it necessary, I can squash this in, though this is
> > > > a bit ugly.  But I just want to confirm whether this would be anything
> > > > we want...
> > > >
> > >
> > > I didn’t look into your actual code yet. If others think above
> > > change is OK then it's definitely good as we conform to hardware
> > > behavior here. Otherwise if there is a way to detect such unusual
> > > usage and then adopt some action (say kill the VM), it's also fine
> > > since user knows he runs a bad OS which is not supported by
> > > Qemu. It's just not good if such situation is not handled, which
> > > leads to some undefined behavior which nobody knows the reason
> > > w/o hard debug into. :-)
> >
> > Yeah, then let me do this. :)
> >
> > Jason, would you be good with above change squashed?
> 
> Self NAK on this...
> 
> More than half of the whole series tries to solve the solo problem
> that we unmapped some pages that were already mapped, which proved
> to
> be wrong.  Now if we squash the change we will do the same wrong
> thing, so we'll still have a very small window that the remapped page
> be missing from a device's POV.
> 
> Now to solve this I suppose we'll need to cache every translation then
> we can know whether a mapping has changed, and we only remap when it
> really has changed.  But I'm afraid that can be a big amount of data
> for nested guests.  For a most common 4G L2 guest, I think the worst
> case (e.g., no huge page at all, no continuous pages) is 4G/4K=1M
> entries in that tree.

I think one key factor to think about is the effect of PSI. Per the
VT-d spec, all internal caches (IOTLB entries, paging-structure
cache entries, etc.) associated with the specified address range
must be flushed accordingly, i.e. no caching of stale mappings.
Now the per-device IOVA ranges are a new type of cache introduced
by QEMU's VT-d. It doesn't cache the actual mappings, but its purpose
is to decide whether to notify VFIO of an updated mapping. In this
manner, if we don't differentiate whether an entry refers to a stale
mapping, it looks like the definition of PSI is broken.

Let me ask another question: in reality, how much improvement can this
patch bring? I.e., is it usual to see a guest map an already-mapped
region, or unmap an already-unmapped region?

> 
> Is it really worth it to solve this possibly-buggy-guest-OS problem
> with such a overhead?  I don't know..

If adding that overhead removes the benefit of this patch, then it's
definitely not a good thing.

> 
> I'm not sure whether it's still acceptable that we put this issue
> aside.  We should know that normal OSs should not do this, and if they
> do, IMHO it proves a buggy OS already (so even from hardware POV we
> allow this, from software POV it should still be problematic), then
> it'll have problem for sure, but only within the VM itself, and it
> won't affect other VMs or the host.  That sounds still reasonable to
> me so far.

As said earlier, what I'm worried about is whether there is a way to
detect such a case when your assumption is violated. Usually
emulation can choose not to implement all the features which
are supported on the real device, but that is done in a way that
unsupported features/behaviors can be reported to the guest
(based on the spec definition), so the guest knows what to expect
from the emulated device...

> 
> Of course I'm always willing to listen to more advice on this.
> 
> Thanks,
> 
> --
> Peter Xu
Jason Wang April 28, 2018, 1:49 a.m. UTC | #9
On 2018年04月27日 19:40, Peter Xu wrote:
> On Fri, Apr 27, 2018 at 05:55:27PM +0800, Peter Xu wrote:
>> On Fri, Apr 27, 2018 at 07:44:07AM +0000, Tian, Kevin wrote:
>>>> From: Peter Xu [mailto:peterx@redhat.com]
>>>> Sent: Friday, April 27, 2018 3:28 PM
>>>>
>>>> On Fri, Apr 27, 2018 at 07:02:14AM +0000, Tian, Kevin wrote:
>>>>>> From: Jason Wang [mailto:jasowang@redhat.com]
>>>>>> Sent: Friday, April 27, 2018 2:08 PM
>>>>>>
>>>>>> On 2018年04月25日 12:51, Peter Xu wrote:
>>>>>>> For each VTDAddressSpace, now we maintain what IOVA ranges we
>>>> have
>>>>>>> mapped and what we have not.  With that information, now we only
>>>>>> send
>>>>>>> MAP or UNMAP when necessary.  Say, we don't send MAP notifies if
>>>> we
>>>>>> know
>>>>>>> we have already mapped the range, meanwhile we don't send
>>>> UNMAP
>>>>>> notifies
>>>>>>> if we know we never mapped the range at all.
>>>>>>>
>>>>>>> Signed-off-by: Peter Xu <peterx@redhat.com>
>>>>>>> ---
>>>>>>>    include/hw/i386/intel_iommu.h |  2 ++
>>>>>>>    hw/i386/intel_iommu.c         | 28 ++++++++++++++++++++++++++++
>>>>>>>    hw/i386/trace-events          |  2 ++
>>>>>>>    3 files changed, 32 insertions(+)
>>>>>>>
>>>>>>> diff --git a/include/hw/i386/intel_iommu.h
>>>>>> b/include/hw/i386/intel_iommu.h
>>>>>>> index 486e205e79..09a2e94404 100644
>>>>>>> --- a/include/hw/i386/intel_iommu.h
>>>>>>> +++ b/include/hw/i386/intel_iommu.h
>>>>>>> @@ -27,6 +27,7 @@
>>>>>>>    #include "hw/i386/ioapic.h"
>>>>>>>    #include "hw/pci/msi.h"
>>>>>>>    #include "hw/sysbus.h"
>>>>>>> +#include "qemu/interval-tree.h"
>>>>>>>
>>>>>>>    #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
>>>>>>>    #define INTEL_IOMMU_DEVICE(obj) \
>>>>>>> @@ -95,6 +96,7 @@ struct VTDAddressSpace {
>>>>>>>        QLIST_ENTRY(VTDAddressSpace) next;
>>>>>>>        /* Superset of notifier flags that this address space has */
>>>>>>>        IOMMUNotifierFlag notifier_flags;
>>>>>>> +    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
>>>>>>>    };
>>>>>>>
>>>>>>>    struct VTDBus {
>>>>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>>>>>>> index a19c18b8d4..8f396a5d13 100644
>>>>>>> --- a/hw/i386/intel_iommu.c
>>>>>>> +++ b/hw/i386/intel_iommu.c
>>>>>>> @@ -768,12 +768,37 @@ typedef struct {
>>>>>>>    static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
>>>>>>>                                 vtd_page_walk_info *info)
>>>>>>>    {
>>>>>>> +    VTDAddressSpace *as = info->as;
>>>>>>>        vtd_page_walk_hook hook_fn = info->hook_fn;
>>>>>>>        void *private = info->private;
>>>>>>> +    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
>>>>>>> +                                   entry->iova + entry->addr_mask);
>>>>>>>
>>>>>>>        assert(hook_fn);
>>>>>>> +
>>>>>>> +    /* Update local IOVA mapped ranges */
>>>>>>> +    if (entry->perm) {
>>>>>>> +        if (mapped) {
>>>>>>> +            /* Skip since we have already mapped this range */
>>>>>>> +            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
>>>>>>> addr_mask,
>>>>>>> +                                             mapped->start, mapped->end);
>>>>>>> +            return 0;
>>>>>>> +        }
>>>>>>> +        it_tree_insert(as->iova_tree, entry->iova,
>>>>>>> +                       entry->iova + entry->addr_mask);
>>>>>> I was consider a case e.g:
>>>>>>
>>>>>> 1) map A (iova) to B (pa)
>>>>>> 2) invalidate A
>>>>>> 3) map A (iova) to C (pa)
>>>>>> 4) invalidate A
>>>>>>
>>>>>> In this case, we will probably miss a walk here. But I'm not sure it was
>>>>>> allowed by the spec (though I think so).
>>>>>>
>>>> Hi, Kevin,
>>>>
>>>> Thanks for joining the discussion.
>>>>
>>>>> I thought it was wrong in a glimpse, but then changed my mind after
>>>>> another thinking. As long as device driver can quiescent the device
>>>>> to not access A (iova) within above window, then above sequence
>>>>> has no problem since any stale mappings (A->B) added before step 4)
>>>>> won't be used and then will get flushed after step 4). Regarding to
>>>>> that actually the 1st invalidation can be skipped:
>>>>>
>>>>> 1) map A (iova) to B (pa)
>>>>> 2) driver programs device to use A
>>>>> 3) driver programs device to not use A
>>>>> 4) map A (iova) to C (pa)
>>>>> 	A->B may be still valid in IOTLB
>>>>> 5) invalidate A
>>>>> 6) driver programs device to use A
>>>> Note that IMHO this is a bit different from Jason's example, and it'll
>>>> be fine.  Current code should work well with this scenario since the
>>>> emulation code will not aware of the map A until step (5).  Then we'll
>>>> have the correct mapping.
>>> you are right. we still need the extra PSI otherwise the 1st mapping
>>> is problematic for use. So back to Jason's example.
>>>
>>>> While for Jason's example it's exactly the extra PSI that might cause
>>>> stale mappings (though again I think it's still problematic...).
>>> problematic in software side (e.g. that way IOMMU core relies on
>>> device drivers which conflict with the value of using IOMMU) but
>>> it is OK from hardware p.o.v. btw the extra PSI itself doesn't cause
>>> stale mapping. Instead it is device activity after that PSI may cause it.
>>>
>>>> Actually I think I can just fix up the code even if Jason's case
>>>> happens by unmapping that first then remap:
>>>>
>>>> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>>>> index 31e9b52452..2a9584f9d8 100644
>>>> --- a/hw/i386/intel_iommu.c
>>>> +++ b/hw/i386/intel_iommu.c
>>>> @@ -778,13 +778,21 @@ static int vtd_page_walk_one(IOMMUTLBEntry
>>>> *entry, int level,
>>>>       /* Update local IOVA mapped ranges */
>>>>       if (entry->perm) {
>>>>           if (mapped) {
>>>> -            /* Skip since we have already mapped this range */
>>>> -            trace_vtd_page_walk_one_skip_map(entry->iova, entry-
>>>>> addr_mask,
>>>> -                                             mapped->start, mapped->end);
>>>> -            return 0;
>>>> +            int ret;
>>>> +            /* Cache the write permission */
>>>> +            IOMMUAccessFlags flags = entry->perm;
>>>> +
>>>> +            /* UNMAP the old first then remap.  No need to touch IOVA tree */
>>>> +            entry->perm = IOMMU_NONE;
>>>> +            ret = hook_fn(entry, private);
>>>> +            if (ret) {
>>>> +                return ret;
>>>> +            }
>>>> +            entry->perm = flags;
>>>> +        } else {
>>>> +            it_tree_insert(as->iova_tree, entry->iova,
>>>> +                           entry->iova + entry->addr_mask);
>>>>           }
>>>> -        it_tree_insert(as->iova_tree, entry->iova,
>>>> -                       entry->iova + entry->addr_mask);
>>>>       } else {
>>>>           if (!mapped) {
>>>>               /* Skip since we didn't map this range at all */
>>>>
>>>> If we really think it necessary, I can squash this in, though this is
>>>> a bit ugly.  But I just want to confirm whether this would be anything
>>>> we want...
>>>>
>>> I didn’t look into your actual code yet. If others think above
>>> change is OK then it's definitely good as we conform to hardware
>>> behavior here. Otherwise if there is a way to detect such unusual
>>> usage and then adopt some action (say kill the VM), it's also fine
>>> since user knows he runs a bad OS which is not supported by
>>> Qemu. It's just not good if such situation is not handled, which
>>> leads to some undefined behavior which nobody knows the reason
>>> w/o hard debug into. :-)
>> Yeah, then let me do this. :)
>>
>> Jason, would you be good with above change squashed?
> Self NAK on this...
>
> More than half of the whole series tries to solve the solo problem
> that we unmapped some pages that were already mapped, which proved to
> be wrong.  Now if we squash the change we will do the same wrong
> thing, so we'll still have a very small window that the remapped page
> be missing from a device's POV.
>
> Now to solve this I suppose we'll need to cache every translation then
> we can know whether a mapping has changed, and we only remap when it
> really has changed.  But I'm afraid that can be a big amount of data
> for nested guests.  For a most common 4G L2 guest, I think the worst
> case (e.g., no huge page at all, no continuous pages) is 4G/4K=1M
> entries in that tree.
>
> Is it really worth it to solve this possibly-buggy-guest-OS problem
> with such a overhead?  I don't know..
>
> I'm not sure whether it's still acceptable that we put this issue
> aside.  We should know that normal OSs should not do this, and if they
> do, IMHO it proves a buggy OS already (so even from hardware POV we
> allow this, from software POV it should still be problematic), then
> it'll have problem for sure, but only within the VM itself, and it
> won't affect other VMs or the host.  That sounds still reasonable to
> me so far.
>
> Of course I'm always willing to listen to more advice on this.
>
> Thanks,
>

I think as QEMU we need to emulate exactly the behavior of the
hardware, otherwise there will be bugs that are too hard to debug.
Looking at the history of the e1000 emulation code, we even try to
emulate undocumented hardware behavior to unbreak some less popular
operating systems.

Thanks
Peter Xu May 3, 2018, 6:04 a.m. UTC | #10
On Fri, Apr 27, 2018 at 11:37:24PM +0000, Tian, Kevin wrote:

[...]

> > Self NAK on this...
> > 
> > More than half of the whole series tries to solve the solo problem
> > that we unmapped some pages that were already mapped, which proved
> > to
> > be wrong.  Now if we squash the change we will do the same wrong
> > thing, so we'll still have a very small window that the remapped page
> > be missing from a device's POV.
> > 
> > Now to solve this I suppose we'll need to cache every translation then
> > we can know whether a mapping has changed, and we only remap when it
> > really has changed.  But I'm afraid that can be a big amount of data
> > for nested guests.  For a most common 4G L2 guest, I think the worst
> > case (e.g., no huge page at all, no continuous pages) is 4G/4K=1M
> > entries in that tree.
> 
> I think one key factor to think about is the effect of PSI. From
> VT-d spec, all internal caches (IOTLB entries, paging-structure
> cache entries, etc.) associated with specified address range
> must be flushed accordingly, i.e. no cache on stale mapping.
> Now per-device iova ranges is new type of cache introduced
> by Qemu VT-d. It doesn't cache actual mapping but its purpose
> is to decide whether to notify VFIO for updated mapping. In
> this manner if we don't differentiate whether an entry is
> for stale mapping, looks the definition of PSI is broken.
> 
> ask another question. In reality how much improvement this
> patch can bring, i.e. is it usual to see guest map on an already
> mapped region, or unmap an already unmapped region?

The funny thing is that there is actually no MAP/UNMAP flag for
PSI/DSI.  For either MAP or UNMAP, the guest sends one PSI for that
range, or even a DSI (Domain Selective Invalidation).

This patch is mostly helpful not really for PSIs, but for DSIs.
Please have a look at issue (4) that I mentioned in the cover letter.
This patch is the core part of solving that DMA error problem.

An example is that we can get a DSI for a domain that already has
existing mappings.  In QEMU we handle a DSI as a whole-range PSI, so
before this patch we would unmap those already-mapped pages and then
remap all of them.  However, we can't map the same page again, so we
cache what we have mapped here.
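
To make it concrete, the per-entry logic this patch adds boils down to
the sketch below (condensed from the vtd_page_walk_one() hunk at the
bottom of this patch; ITRange and the it_tree_*() helpers come from the
interval-tree patch earlier in this series):

/* Condensed sketch of the per-entry replay logic.  On a second DSI for
 * the same domain, ranges already present in the per-device iova_tree
 * are skipped, so the existing shadow mapping in VFIO is left
 * untouched instead of being unmapped and re-mapped. */
static int vtd_replay_one_sketch(VTDAddressSpace *as, IOMMUTLBEntry *entry,
                                 vtd_page_walk_hook hook_fn, void *private)
{
    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
                                   entry->iova + entry->addr_mask);

    if (entry->perm) {                    /* present PTE: MAP event */
        if (mapped) {
            return 0;                     /* already shadowed: skip */
        }
        it_tree_insert(as->iova_tree, entry->iova,
                       entry->iova + entry->addr_mask);
    } else {                              /* non-present: UNMAP event */
        if (!mapped) {
            return 0;                     /* never shadowed: skip */
        }
        it_tree_remove(as->iova_tree, entry->iova,
                       entry->iova + entry->addr_mask);
    }

    return hook_fn(entry, private);       /* forward MAP/UNMAP to VFIO */
}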

> 
> > 
> > Is it really worth it to solve this possibly-buggy-guest-OS problem
> > with such a overhead?  I don't know..
> 
> If adding overhead removes the benefit of this patch, then 
> definitely not a good thing.

For the problem that we are trying to solve, this patch is not really
an optimization; it fixes a critical bug.  Again, please refer to
issue (4) of the cover letter for that 3ms window problem.

> 
> > 
> > I'm not sure whether it's still acceptable that we put this issue
> > aside.  We should know that normal OSs should not do this, and if they
> > do, IMHO it proves a buggy OS already (so even from hardware POV we
> > allow this, from software POV it should still be problematic), then
> > it'll have problem for sure, but only within the VM itself, and it
> > won't affect other VMs or the host.  That sounds still reasonable to
> > me so far.
> 
> As said earlier, what I'm worried is whether there is a way to
> detect such case when your assumption is violated. usually
> emulation can choose to not implement all the features which
> are supported on the real device, but it's done in a way that
> non-supported features/behaviors can be reported to guest
> (based on spec definition) thus guest knows the expectation
> from the emulated device...

IMHO the guest can't really detect this, but it'll find that the
device is not working correctly if it does something like what
Jason has mentioned.

Actually now I have an idea for how to live well even with Jason's
example: maybe we'll need to distinguish PSI from DSI.  For a DSI, we
don't remap already-mapped pages; for a PSI, we unmap and remap the
mapped pages.  That'll complicate the code a bit, but it should
satisfy everybody.
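
Very roughly, on top of this patch the idea would look like the sketch
below.  Note the "is_dsi" flag is hypothetical - nothing in the posted
series carries it - so please read this as a direction rather than a
tested change:

/* Hypothetical sketch: distinguish DSI from PSI during the shadow
 * page walk.  Types and the it_tree_*()/hook_fn helpers are the ones
 * introduced earlier in this series; "is_dsi" is not. */
static int vtd_walk_one_idea(VTDAddressSpace *as, IOMMUTLBEntry *entry,
                             bool is_dsi,
                             vtd_page_walk_hook hook_fn, void *private)
{
    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
                                   entry->iova + entry->addr_mask);

    if (entry->perm) {
        if (mapped) {
            if (is_dsi) {
                /* DSI replay: range already shadowed, leave it be */
                return 0;
            }
            /* PSI on a mapped range: the PTE may have been modified,
             * so tear the old shadow down before mapping the new one. */
            IOMMUAccessFlags perm = entry->perm;
            int ret;

            entry->perm = IOMMU_NONE;
            ret = hook_fn(entry, private);     /* UNMAP the old one */
            if (ret) {
                return ret;
            }
            entry->perm = perm;
        } else {
            it_tree_insert(as->iova_tree, entry->iova,
                           entry->iova + entry->addr_mask);
        }
        return hook_fn(entry, private);        /* MAP */
    }

    if (!mapped) {
        return 0;                  /* never shadowed, nothing to do */
    }
    it_tree_remove(as->iova_tree, entry->iova,
                   entry->iova + entry->addr_mask);
    return hook_fn(entry, private);            /* UNMAP */
}

The extra unmap in the PSI branch is the part that may still be
unnecessary when the PTE did not actually change.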

Thanks,
Jason Wang May 3, 2018, 7:20 a.m. UTC | #11
On 2018年05月03日 14:04, Peter Xu wrote:
> IMHO the guest can't really detect this, but it'll found that the
> device is not working functionally if it's doing something like what
> Jason has mentioned.
>
> Actually now I have had an idea if we really want to live well even
> with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
> we don't remap for mapped pages; for PSI, we unmap and remap the
> mapped pages.  That'll complicate the stuff a bit, but it should
> satisfy all the people.
>
> Thanks,

So it looks like there will still be unnecessary unmaps.  How about
recording the mappings in the tree too?

Thanks
Peter Xu May 3, 2018, 7:28 a.m. UTC | #12
On Thu, May 03, 2018 at 03:20:11PM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月03日 14:04, Peter Xu wrote:
> > IMHO the guest can't really detect this, but it'll found that the
> > device is not working functionally if it's doing something like what
> > Jason has mentioned.
> > 
> > Actually now I have had an idea if we really want to live well even
> > with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
> > we don't remap for mapped pages; for PSI, we unmap and remap the
> > mapped pages.  That'll complicate the stuff a bit, but it should
> > satisfy all the people.
> > 
> > Thanks,
> 
> So it looks like there will still be unnecessary unmaps.

Could I ask what you mean by "unnecessary unmaps"?

> How about recording the mappings in the tree too?

As I mentioned, for an L1 guest (e.g., DPDK applications running in
L1) it'll be fine.  But I'm just afraid we will have other use cases,
like L2 guests.  Those might need tons of mapping entries in the
worst-case scenario.
Jason Wang May 3, 2018, 7:43 a.m. UTC | #13
On 2018年05月03日 15:28, Peter Xu wrote:
> On Thu, May 03, 2018 at 03:20:11PM +0800, Jason Wang wrote:
>>
>> On 2018年05月03日 14:04, Peter Xu wrote:
>>> IMHO the guest can't really detect this, but it'll found that the
>>> device is not working functionally if it's doing something like what
>>> Jason has mentioned.
>>>
>>> Actually now I have had an idea if we really want to live well even
>>> with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
>>> we don't remap for mapped pages; for PSI, we unmap and remap the
>>> mapped pages.  That'll complicate the stuff a bit, but it should
>>> satisfy all the people.
>>>
>>> Thanks,
>> So it looks like there will be still unnecessary unamps.
> Could I ask what do you mean by "unecessary unmaps"?

It's about "for PSI, we unmap and remap the mapped pages".  For the
first "unmap", how do you know it was really necessary without knowing
the state of the current shadow page table?

>> How about record the mappings in the tree too?
> As I mentioned, for L1 guest (e.g., DPDK applications running in L1)
> it'll be fine.  But I'm just afraid we will have other use cases, like
> the L2 guests. That might need tons of the mapping entries in the
> worst case scenario.
>

Yes, but that's the price of shadow page tables.

Thanks
Peter Xu May 3, 2018, 7:53 a.m. UTC | #14
On Thu, May 03, 2018 at 03:43:35PM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月03日 15:28, Peter Xu wrote:
> > On Thu, May 03, 2018 at 03:20:11PM +0800, Jason Wang wrote:
> > > 
> > > On 2018年05月03日 14:04, Peter Xu wrote:
> > > > IMHO the guest can't really detect this, but it'll found that the
> > > > device is not working functionally if it's doing something like what
> > > > Jason has mentioned.
> > > > 
> > > > Actually now I have had an idea if we really want to live well even
> > > > with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
> > > > we don't remap for mapped pages; for PSI, we unmap and remap the
> > > > mapped pages.  That'll complicate the stuff a bit, but it should
> > > > satisfy all the people.
> > > > 
> > > > Thanks,
> > > So it looks like there will be still unnecessary unamps.
> > Could I ask what do you mean by "unecessary unmaps"?
> 
> It's for "for PSI, we unmap and remap the mapped pages". So for the first
> "unmap" how do you know it was really necessary without knowing the state of
> current shadow page table?

I don't.  Could I just unmap it anyway?  Say the guest has _modified_
the PTE already.  Yes, I think that's following the spec, but it is
really _unsafe_; we can know that from what it has done already.
Then I really think an unmap+map would be good enough for us...  After
all, that behavior can cause DMA errors even on real hardware, and one
can never tell.

> 
> > > How about record the mappings in the tree too?
> > As I mentioned, for L1 guest (e.g., DPDK applications running in L1)
> > it'll be fine.  But I'm just afraid we will have other use cases, like
> > the L2 guests. That might need tons of the mapping entries in the
> > worst case scenario.
> > 
> 
> Yes, but that's the price of shadow page tables.

So that's why I would like to propose this mergeable interval tree.  It
might greatly reduce the price if we can reach a consensus on how we
should treat those strangely behaved guest OSes.  Thanks,
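
PS: to show what I mean by "mergeable", here is a toy, standalone
sketch of merging on insert (this is *not* the ITTree implementation
from this series, just an illustration): a guest that maps 4G
contiguously in 4K steps ends up with a single range here instead of
1M nodes.

#include <stdint.h>
#include <stdlib.h>

typedef struct Range {
    uint64_t start, end;                /* inclusive bounds */
    struct Range *next;
} Range;

/* Insert [start, end] into a sorted, non-overlapping list, merging with
 * any range that overlaps or is directly adjacent.  Error handling and
 * the end == UINT64_MAX corner case are omitted for brevity. */
static void range_insert(Range **head, uint64_t start, uint64_t end)
{
    Range **p = head, *r;

    /* Skip ranges that lie entirely below us and are not adjacent */
    while (*p && (*p)->end + 1 < start) {
        p = &(*p)->next;
    }

    /* Swallow every range that overlaps or touches [start, end] */
    while ((r = *p) && r->start <= end + 1) {
        start = r->start < start ? r->start : start;
        end = r->end > end ? r->end : end;
        *p = r->next;
        free(r);
    }

    r = malloc(sizeof(*r));
    r->start = start;
    r->end = end;
    r->next = *p;
    *p = r;
}

The real tree needs a proper search structure underneath, of course,
but the merging is what keeps the worst case far away from one node
per 4K page for well-behaved mappings.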
Jason Wang May 3, 2018, 9:22 a.m. UTC | #15
On 2018年05月03日 15:53, Peter Xu wrote:
> On Thu, May 03, 2018 at 03:43:35PM +0800, Jason Wang wrote:
>>
>> On 2018年05月03日 15:28, Peter Xu wrote:
>>> On Thu, May 03, 2018 at 03:20:11PM +0800, Jason Wang wrote:
>>>> On 2018年05月03日 14:04, Peter Xu wrote:
>>>>> IMHO the guest can't really detect this, but it'll found that the
>>>>> device is not working functionally if it's doing something like what
>>>>> Jason has mentioned.
>>>>>
>>>>> Actually now I have had an idea if we really want to live well even
>>>>> with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
>>>>> we don't remap for mapped pages; for PSI, we unmap and remap the
>>>>> mapped pages.  That'll complicate the stuff a bit, but it should
>>>>> satisfy all the people.
>>>>>
>>>>> Thanks,
>>>> So it looks like there will be still unnecessary unamps.
>>> Could I ask what do you mean by "unecessary unmaps"?
>> It's for "for PSI, we unmap and remap the mapped pages". So for the first
>> "unmap" how do you know it was really necessary without knowing the state of
>> current shadow page table?
> I don't.  Could I just unmap it anyway?  Say, now the guest _modified_
> the PTE already.  Yes I think it's following the spec, but it is
> really _unsafe_.  We can know that from what it has done already.
> Then I really think a unmap+map would be good enough for us...  After
> all that behavior can cause DMA error even on real hardwares.  It can
> never tell.

I mean the following case:

1) guest maps A1 (iova) to XXX
2) guest maps A2 (A1 + 4K) (iova) to YYY
3) guest maps A3 (A1 + 8K) (iova) to ZZZ
4) guest unmaps A2 and A3; to reduce the number of PSIs, it can
invalidate A1 with a range of 2M

If this is allowed by the spec, it looks like A1 will be unmapped and
remapped.

Thanks

>
>>>> How about record the mappings in the tree too?
>>> As I mentioned, for L1 guest (e.g., DPDK applications running in L1)
>>> it'll be fine.  But I'm just afraid we will have other use cases, like
>>> the L2 guests. That might need tons of the mapping entries in the
>>> worst case scenario.
>>>
>> Yes, but that's the price of shadow page tables.
> So that's why I would like to propose this mergable interval tree.  It
> might greatly reduce the price if we can reach a consensus on how we
> should treat those strange-behaved guest OSs.  Thanks,
>
Peter Xu May 3, 2018, 9:53 a.m. UTC | #16
On Thu, May 03, 2018 at 05:22:03PM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月03日 15:53, Peter Xu wrote:
> > On Thu, May 03, 2018 at 03:43:35PM +0800, Jason Wang wrote:
> > > 
> > > On 2018年05月03日 15:28, Peter Xu wrote:
> > > > On Thu, May 03, 2018 at 03:20:11PM +0800, Jason Wang wrote:
> > > > > On 2018年05月03日 14:04, Peter Xu wrote:
> > > > > > IMHO the guest can't really detect this, but it'll found that the
> > > > > > device is not working functionally if it's doing something like what
> > > > > > Jason has mentioned.
> > > > > > 
> > > > > > Actually now I have had an idea if we really want to live well even
> > > > > > with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
> > > > > > we don't remap for mapped pages; for PSI, we unmap and remap the
> > > > > > mapped pages.  That'll complicate the stuff a bit, but it should
> > > > > > satisfy all the people.
> > > > > > 
> > > > > > Thanks,
> > > > > So it looks like there will be still unnecessary unamps.
> > > > Could I ask what do you mean by "unecessary unmaps"?
> > > It's for "for PSI, we unmap and remap the mapped pages". So for the first
> > > "unmap" how do you know it was really necessary without knowing the state of
> > > current shadow page table?
> > I don't.  Could I just unmap it anyway?  Say, now the guest _modified_
> > the PTE already.  Yes I think it's following the spec, but it is
> > really _unsafe_.  We can know that from what it has done already.
> > Then I really think a unmap+map would be good enough for us...  After
> > all that behavior can cause DMA error even on real hardwares.  It can
> > never tell.
> 
> I mean for following case:
> 
> 1) guest maps A1 (iova) to XXX
> 2) guest maps A2 (A1 + 4K) (iova) to YYY
> 3) guest maps A3 (A1 + 8K) (iova) to ZZZ
> 4) guest unmaps A2 and A2, for reducing the number of PSIs, it can
> invalidate A1 with a range of 2M
> 
> If this is allowed by spec, looks like A1 will be unmaped and mapped.

My follow-up patch won't survive this case, but the original patch
will work.

Jason and I discussed this a bit on IRC.  Here's the conclusion we
reached: for now we use my original patch (which solves everything
except PTE modifications), and we mark the modify-PTE problem as a
TODO.  Then at least we can have nested device assignment work well
on known OSes first.
Peter Xu May 3, 2018, 12:01 p.m. UTC | #17
On Thu, May 03, 2018 at 05:53:59PM +0800, Peter Xu wrote:
> On Thu, May 03, 2018 at 05:22:03PM +0800, Jason Wang wrote:
> > 
> > 
> > On 2018年05月03日 15:53, Peter Xu wrote:
> > > On Thu, May 03, 2018 at 03:43:35PM +0800, Jason Wang wrote:
> > > > 
> > > > On 2018年05月03日 15:28, Peter Xu wrote:
> > > > > On Thu, May 03, 2018 at 03:20:11PM +0800, Jason Wang wrote:
> > > > > > On 2018年05月03日 14:04, Peter Xu wrote:
> > > > > > > IMHO the guest can't really detect this, but it'll found that the
> > > > > > > device is not working functionally if it's doing something like what
> > > > > > > Jason has mentioned.
> > > > > > > 
> > > > > > > Actually now I have had an idea if we really want to live well even
> > > > > > > with Jason's example: maybe we'll need to identify PSI/DSI.  For DSI,
> > > > > > > we don't remap for mapped pages; for PSI, we unmap and remap the
> > > > > > > mapped pages.  That'll complicate the stuff a bit, but it should
> > > > > > > satisfy all the people.
> > > > > > > 
> > > > > > > Thanks,
> > > > > > So it looks like there will be still unnecessary unamps.
> > > > > Could I ask what do you mean by "unecessary unmaps"?
> > > > It's for "for PSI, we unmap and remap the mapped pages". So for the first
> > > > "unmap" how do you know it was really necessary without knowing the state of
> > > > current shadow page table?
> > > I don't.  Could I just unmap it anyway?  Say, now the guest _modified_
> > > the PTE already.  Yes I think it's following the spec, but it is
> > > really _unsafe_.  We can know that from what it has done already.
> > > Then I really think a unmap+map would be good enough for us...  After
> > > all that behavior can cause DMA error even on real hardwares.  It can
> > > never tell.
> > 
> > I mean for following case:
> > 
> > 1) guest maps A1 (iova) to XXX
> > 2) guest maps A2 (A1 + 4K) (iova) to YYY
> > 3) guest maps A3 (A1 + 8K) (iova) to ZZZ
> > 4) guest unmaps A2 and A2, for reducing the number of PSIs, it can
> > invalidate A1 with a range of 2M
> > 
> > If this is allowed by spec, looks like A1 will be unmaped and mapped.
> 
> My follow-up patch won't survive with this one but the original patch
> will work.
> 
> Jason and I discussed a bit on IRC on this matter.  Here's the
> conclusion we got: for now we use my original patch (which solves
> everything except PTE modifications).  We mark that modify-PTE problem
> as TODO. Then at least we can have the nested device assignment work
> well on known OSs first.

Here just to mention that we actually have no way to emulate a PTE
modification procedure.  The problem is that we can never atomically
modify a PTE on the host with Linux, either via the VFIO interface or
even directly using the IOMMU API in the kernel.  To be more specific
to our use case: VFIO provides VFIO_IOMMU_MAP_DMA and
VFIO_IOMMU_UNMAP_DMA, but it never provides a VFIO_IOMMU_MODIFY_DMA to
modify a PTE atomically.  It means that even if we know the PTE has
changed, we can only unmap it and remap.  That still has the same
"invalid window" problem we have discussed, since between the unmap
and the remap the page is invalid (while from the guest's POV it
should never be, since the PTE modification is atomic).
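
For reference, a minimal sketch of what "modifying" a mapping has to
look like through the VFIO type1 API today, assuming an already set up
container fd and with error handling trimmed - the gap between the two
ioctls is exactly the "invalid window" above:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch only: emulate a PTE modification as unmap + map, since there
 * is no atomic VFIO_IOMMU_MODIFY_DMA.  Between the two ioctls the iova
 * has no translation at all, so in-flight device DMA to it faults. */
static int remap_iova(int container_fd, uint64_t iova, uint64_t size,
                      void *new_vaddr)
{
    struct vfio_iommu_type1_dma_unmap unmap = {
        .argsz = sizeof(unmap),
        .iova = iova,
        .size = size,
    };
    struct vfio_iommu_type1_dma_map map = {
        .argsz = sizeof(map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .vaddr = (uintptr_t)new_vaddr,
        .iova = iova,
        .size = size,
    };

    if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap) < 0) {
        return -1;
    }
    /* <-- invalid window: iova is not translated here --> */
    return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}

So even if we could detect the PTE change precisely, the best userspace
can do is shrink that window, not eliminate it.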
diff mbox series

Patch

diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 486e205e79..09a2e94404 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -27,6 +27,7 @@ 
 #include "hw/i386/ioapic.h"
 #include "hw/pci/msi.h"
 #include "hw/sysbus.h"
+#include "qemu/interval-tree.h"
 
 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@@ -95,6 +96,7 @@  struct VTDAddressSpace {
     QLIST_ENTRY(VTDAddressSpace) next;
     /* Superset of notifier flags that this address space has */
     IOMMUNotifierFlag notifier_flags;
+    ITTree *iova_tree;          /* Traces mapped IOVA ranges */
 };
 
 struct VTDBus {
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a19c18b8d4..8f396a5d13 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -768,12 +768,37 @@  typedef struct {
 static int vtd_page_walk_one(IOMMUTLBEntry *entry, int level,
                              vtd_page_walk_info *info)
 {
+    VTDAddressSpace *as = info->as;
     vtd_page_walk_hook hook_fn = info->hook_fn;
     void *private = info->private;
+    ITRange *mapped = it_tree_find(as->iova_tree, entry->iova,
+                                   entry->iova + entry->addr_mask);
 
     assert(hook_fn);
+
+    /* Update local IOVA mapped ranges */
+    if (entry->perm) {
+        if (mapped) {
+            /* Skip since we have already mapped this range */
+            trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+                                             mapped->start, mapped->end);
+            return 0;
+        }
+        it_tree_insert(as->iova_tree, entry->iova,
+                       entry->iova + entry->addr_mask);
+    } else {
+        if (!mapped) {
+            /* Skip since we didn't map this range at all */
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+            return 0;
+        }
+        it_tree_remove(as->iova_tree, entry->iova,
+                       entry->iova + entry->addr_mask);
+    }
+
     trace_vtd_page_walk_one(level, entry->iova, entry->translated_addr,
                             entry->addr_mask, entry->perm);
+
     return hook_fn(entry, private);
 }
 
@@ -2798,6 +2823,7 @@  VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
         vtd_dev_as->devfn = (uint8_t)devfn;
         vtd_dev_as->iommu_state = s;
         vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+        vtd_dev_as->iova_tree = it_tree_new();
 
         /*
          * Memory region relationships looks like (Address range shows
@@ -2894,6 +2920,8 @@  static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
                              VTD_PCI_FUNC(as->devfn),
                              entry.iova, size);
 
+    it_tree_remove(as->iova_tree, entry.iova, entry.iova + entry.addr_mask);
+
     memory_region_notify_one(n, &entry);
 }
 
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 22d44648af..677f83420d 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -40,6 +40,8 @@  vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint6
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
 vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t start, uint64_t end) "iova 0x%"PRIx64" mask 0x%"PRIx64" start 0x%"PRIx64" end 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
 vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"