diff mbox series

[QEMU,v25,13/17] vfio: create mapped iova list when vIOMMU is enabled

Message ID 1592684486-18511-14-git-send-email-kwankhede@nvidia.com
State New
Headers show
Series Add migration support for VFIO devices | expand

Commit Message

Kirti Wankhede June 20, 2020, 8:21 p.m. UTC
Create mapped iova list when vIOMMU is enabled. For each mapped iova
save translated address. Add node to list on MAP and remove node from
list on UNMAP.
This list is used to track dirty pages during migration.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
---
 hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
 include/hw/vfio/vfio-common.h |  8 ++++++
 2 files changed, 60 insertions(+), 6 deletions(-)

Comments

Alex Williamson June 24, 2020, 6:55 p.m. UTC | #1
On Sun, 21 Jun 2020 01:51:22 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Create mapped iova list when vIOMMU is enabled. For each mapped iova
> save translated address. Add node to list on MAP and remove node from
> list on UNMAP.
> This list is used to track dirty pages during migration.

This seems like a lot of overhead to support that the VM might migrate.
Is there no way we can build this when we start migration, for example
replaying the mappings at that time?  Thanks,

Alex

 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> ---
>  hw/vfio/common.c              | 58 ++++++++++++++++++++++++++++++++++++++-----
>  include/hw/vfio/vfio-common.h |  8 ++++++
>  2 files changed, 60 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index e0d3d4585a65..6921a78e9ba5 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -408,8 +408,8 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  }
>  
>  /* Called with rcu_read_lock held.  */
> -static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                           bool *read_only)
> +static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> +                               ram_addr_t *ram_addr, bool *read_only)
>  {
>      MemoryRegion *mr;
>      hwaddr xlat;
> @@ -440,8 +440,17 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
>          return false;
>      }
>  
> -    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> -    *read_only = !writable || mr->readonly;
> +    if (vaddr) {
> +        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
> +    }
> +
> +    if (ram_addr) {
> +        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
> +    }
> +
> +    if (read_only) {
> +        *read_only = !writable || mr->readonly;
> +    }
>  
>      return true;
>  }
> @@ -451,7 +460,6 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
>      VFIOContainer *container = giommu->container;
>      hwaddr iova = iotlb->iova + giommu->iommu_offset;
> -    bool read_only;
>      void *vaddr;
>      int ret;
>  
> @@ -467,7 +475,10 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      rcu_read_lock();
>  
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
> +        ram_addr_t ram_addr;
> +        bool read_only;
> +
> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
>              goto out;
>          }
>          /*
> @@ -485,8 +496,28 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>                           "0x%"HWADDR_PRIx", %p) = %d (%m)",
>                           container, iova,
>                           iotlb->addr_mask + 1, vaddr, ret);
> +        } else {
> +            VFIOIovaRange *iova_range;
> +
> +            iova_range = g_malloc0(sizeof(*iova_range));
> +            iova_range->iova = iova;
> +            iova_range->size = iotlb->addr_mask + 1;
> +            iova_range->ram_addr = ram_addr;
> +
> +            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
>          }
>      } else {
> +        VFIOIovaRange *iova_range, *tmp;
> +
> +        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> +            if (iova_range->iova >= iova &&
> +                iova_range->iova + iova_range->size <= iova +
> +                                                       iotlb->addr_mask + 1) {
> +                QLIST_REMOVE(iova_range, next);
> +                g_free(iova_range);
> +            }
> +        }
> +
>          ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
>          if (ret) {
>              error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> @@ -643,6 +674,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
>              g_free(giommu);
>              goto fail;
>          }
> +        QLIST_INIT(&giommu->iova_list);
>          QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
>          memory_region_iommu_replay(giommu->iommu, &giommu->n);
>  
> @@ -741,6 +773,13 @@ static void vfio_listener_region_del(MemoryListener *listener,
>          QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
>              if (MEMORY_REGION(giommu->iommu) == section->mr &&
>                  giommu->n.start == section->offset_within_region) {
> +                VFIOIovaRange *iova_range, *tmp;
> +
> +                QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
> +                    QLIST_REMOVE(iova_range, next);
> +                    g_free(iova_range);
> +                }
> +
>                  memory_region_unregister_iommu_notifier(section->mr,
>                                                          &giommu->n);
>                  QLIST_REMOVE(giommu, giommu_next);
> @@ -1538,6 +1577,13 @@ static void vfio_disconnect_container(VFIOGroup *group)
>          QLIST_REMOVE(container, next);
>  
>          QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
> +            VFIOIovaRange *iova_range, *itmp;
> +
> +            QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, itmp) {
> +                QLIST_REMOVE(iova_range, next);
> +                g_free(iova_range);
> +            }
> +
>              memory_region_unregister_iommu_notifier(
>                      MEMORY_REGION(giommu->iommu), &giommu->n);
>              QLIST_REMOVE(giommu, giommu_next);
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 5a57a78ec517..56b75e4a8bc4 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -89,11 +89,19 @@ typedef struct VFIOContainer {
>      QLIST_ENTRY(VFIOContainer) next;
>  } VFIOContainer;
>  
> +typedef struct VFIOIovaRange {
> +    hwaddr iova;
> +    size_t size;
> +    ram_addr_t ram_addr;
> +    QLIST_ENTRY(VFIOIovaRange) next;
> +} VFIOIovaRange;
> +
>  typedef struct VFIOGuestIOMMU {
>      VFIOContainer *container;
>      IOMMUMemoryRegion *iommu;
>      hwaddr iommu_offset;
>      IOMMUNotifier n;
> +    QLIST_HEAD(, VFIOIovaRange) iova_list;
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>  } VFIOGuestIOMMU;
>
Kirti Wankhede June 25, 2020, 2:34 p.m. UTC | #2
On 6/25/2020 12:25 AM, Alex Williamson wrote:
> On Sun, 21 Jun 2020 01:51:22 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> Create mapped iova list when vIOMMU is enabled. For each mapped iova
>> save translated address. Add node to list on MAP and remove node from
>> list on UNMAP.
>> This list is used to track dirty pages during migration.
> 
> This seems like a lot of overhead to support that the VM might migrate.
> Is there no way we can build this when we start migration, for example
> replaying the mappings at that time?  Thanks,
> 

In my previous version I tried to go through the whole range and find a valid
iotlb, as below:

+        if (memory_region_is_iommu(section->mr)) {
+            iotlb = address_space_get_iotlb_entry(container->space->as, iova,
+                                                  true, MEMTXATTRS_UNSPECIFIED);

When mapping doesn't exist, qemu throws error as below:

qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error (iova=0x0, level=0x3, slpte=0x0, write=1)
qemu-system-x86_64: vtd_iommu_translate: detected translation failure (dev=00:03:00, iova=0x0)
qemu-system-x86_64: New fault is not recorded due to compression of faults

Secondly, it iterates through the whole range at IOMMU page size
granularity, which is 4K, so it takes a long time, resulting in a large
downtime. With this optimization, the downtime with vIOMMU is reduced
significantly.

Another option I will try: if I can check whether migration is supported,
then create this list only in that case.

Thanks,
Kirti
Alex Williamson June 25, 2020, 5:40 p.m. UTC | #3
On Thu, 25 Jun 2020 20:04:08 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 6/25/2020 12:25 AM, Alex Williamson wrote:
> > On Sun, 21 Jun 2020 01:51:22 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> Create mapped iova list when vIOMMU is enabled. For each mapped iova
> >> save translated address. Add node to list on MAP and remove node from
> >> list on UNMAP.
> >> This list is used to track dirty pages during migration.  
> > 
> > This seems like a lot of overhead to support that the VM might migrate.
> > Is there no way we can build this when we start migration, for example
> > replaying the mappings at that time?  Thanks,
> >   
> 
> In my previous version I tried to go through whole range and find valid 
> iotlb, as below:
> 
> +        if (memory_region_is_iommu(section->mr)) {
> +            iotlb = address_space_get_iotlb_entry(container->space->as, 
> iova,
> +                                                 true, 
> MEMTXATTRS_UNSPECIFIED);
> 
> When mapping doesn't exist, qemu throws error as below:
> 
> qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error 
> (iova=0x0, level=0x3, slpte=0x0, write=1)
> qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
> (dev=00:03:00, iova=0x0)
> qemu-system-x86_64: New fault is not recorded due to compression of faults

My assumption would have been that we use the replay mechanism, which
is known to work because we need to use it when we hot-add a device.
We'd make use of iommu_notifier_init() to create a new handler for this
purpose, then we'd walk our container->giommu_list and call
memory_region_iommu_replay() for each.

Peter, does this sound like the right approach to you?

> Secondly, it iterates through whole range with IOMMU page size 
> granularity which is 4K, so it takes long time resulting in large 
> downtime. With this optimization, downtime with vIOMMU reduced 
> significantly.

Right, but we amortize that overhead and the resulting bloat across the
99.9999% of the time that we're not migrating.  I wonder if we could
startup another thread to handle this when we enable dirty logging.  We
don't really need the result until we start processing the dirty
bitmap, right?  Also, if we're dealing with this many separate pages,
shouldn't we be using a tree rather than a list to give us O(logN)
rather than O(N)?
 
> Other option I will try if I can check that if migration is supported 
> then only create this list.

Wouldn't we still have problems if we start with a guest IOMMU domain
with a device that doesn't support migration, hot-add a device that
does support migration, then hot-remove the original device?  Seems
like our list would only be complete since the migration device was
added.  Thanks,

Alex
Peter Xu June 26, 2020, 2:43 p.m. UTC | #4
On Thu, Jun 25, 2020 at 11:40:39AM -0600, Alex Williamson wrote:
> On Thu, 25 Jun 2020 20:04:08 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 6/25/2020 12:25 AM, Alex Williamson wrote:
> > > On Sun, 21 Jun 2020 01:51:22 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> Create mapped iova list when vIOMMU is enabled. For each mapped iova
> > >> save translated address. Add node to list on MAP and remove node from
> > >> list on UNMAP.
> > >> This list is used to track dirty pages during migration.  
> > > 
> > > This seems like a lot of overhead to support that the VM might migrate.
> > > Is there no way we can build this when we start migration, for example
> > > replaying the mappings at that time?  Thanks,
> > >   
> > 
> > In my previous version I tried to go through whole range and find valid 
> > iotlb, as below:
> > 
> > +        if (memory_region_is_iommu(section->mr)) {
> > +            iotlb = address_space_get_iotlb_entry(container->space->as, 
> > iova,
> > +                                                 true, 
> > MEMTXATTRS_UNSPECIFIED);
> > 
> > When mapping doesn't exist, qemu throws error as below:
> > 
> > qemu-system-x86_64: vtd_iova_to_slpte: detected slpte permission error 
> > (iova=0x0, level=0x3, slpte=0x0, write=1)
> > qemu-system-x86_64: vtd_iommu_translate: detected translation failure 
> > (dev=00:03:00, iova=0x0)
> > qemu-system-x86_64: New fault is not recorded due to compression of faults
> 
> My assumption would have been that we use the replay mechanism, which
> is known to work because we need to use it when we hot-add a device.
> We'd make use of iommu_notifier_init() to create a new handler for this
> purpose, then we'd walk our container->giommu_list and call
> memory_region_iommu_replay() for each.
> 
> Peter, does this sound like the right approach to you?

(Sorry I may not have the complete picture of this series, please bear with
 me...)

This seems to be a workable approach to me.  However then we might have a
similar mapping entry cached the 3rd time...  VFIO kernel has a copy initially,
then QEMU vIOMMU has another one (please grep iova_tree in intel_iommu.c).

My wild guess is that the mapping should still be in control in most cases, so
even if we cache it multiple times (for better layering) it would still be
fine.  However since we're in QEMU right now, I'm also thinking whether we can
share the information with the vIOMMU somehow, because even if the page table
entry is wiped off at that time we may still have a chance to use the DMAMap
object that cached in vIOMMU when iommu notify() happens.  Though that may
require some vIOMMU change too (e.g., vtd_page_walk_one may need to postpone
the iova_tree_remove to be after the hook_fn is called, also we may need to
pass the DMAMap object or at least the previous translated addr to the hook
somehow before removal), so maybe that can also be done on top.

> 
> > Secondly, it iterates through whole range with IOMMU page size 
> > granularity which is 4K, so it takes long time resulting in large 
> > downtime. With this optimization, downtime with vIOMMU reduced 
> > significantly.
> 
> Right, but we amortize that overhead and the resulting bloat across the
> 99.9999% of the time that we're not migrating.  I wonder if we could
> startup another thread to handle this when we enable dirty logging.  We
> don't really need the result until we start processing the dirty
> bitmap, right?  Also, if we're dealing with this many separate pages,
> shouldn't we be using a tree rather than a list to give us O(logN)
> rather than O(N)?

Yep I agree.  At least the vIOMMU cache is using gtree.

Btw, IIUC we won't walk the whole range using 4K granularity always, not for
VT-d emulation.  Because vtd_page_walk_level() is smart enough to skip higher
levels of invalid entries so it can jump with 2M/1G/... chunks if the whole
chunk is invalid.

Thanks,
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index e0d3d4585a65..6921a78e9ba5 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -408,8 +408,8 @@  static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 }
 
 /* Called with rcu_read_lock held.  */
-static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
-                           bool *read_only)
+static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
+                               ram_addr_t *ram_addr, bool *read_only)
 {
     MemoryRegion *mr;
     hwaddr xlat;
@@ -440,8 +440,17 @@  static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void **vaddr,
         return false;
     }
 
-    *vaddr = memory_region_get_ram_ptr(mr) + xlat;
-    *read_only = !writable || mr->readonly;
+    if (vaddr) {
+        *vaddr = memory_region_get_ram_ptr(mr) + xlat;
+    }
+
+    if (ram_addr) {
+        *ram_addr = memory_region_get_ram_addr(mr) + xlat;
+    }
+
+    if (read_only) {
+        *read_only = !writable || mr->readonly;
+    }
 
     return true;
 }
@@ -451,7 +460,6 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
     VFIOContainer *container = giommu->container;
     hwaddr iova = iotlb->iova + giommu->iommu_offset;
-    bool read_only;
     void *vaddr;
     int ret;
 
@@ -467,7 +475,10 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     rcu_read_lock();
 
     if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
-        if (!vfio_get_vaddr(iotlb, &vaddr, &read_only)) {
+        ram_addr_t ram_addr;
+        bool read_only;
+
+        if (!vfio_get_xlat_addr(iotlb, &vaddr, &ram_addr, &read_only)) {
             goto out;
         }
         /*
@@ -485,8 +496,28 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
                          "0x%"HWADDR_PRIx", %p) = %d (%m)",
                          container, iova,
                          iotlb->addr_mask + 1, vaddr, ret);
+        } else {
+            VFIOIovaRange *iova_range;
+
+            iova_range = g_malloc0(sizeof(*iova_range));
+            iova_range->iova = iova;
+            iova_range->size = iotlb->addr_mask + 1;
+            iova_range->ram_addr = ram_addr;
+
+            QLIST_INSERT_HEAD(&giommu->iova_list, iova_range, next);
         }
     } else {
+        VFIOIovaRange *iova_range, *tmp;
+
+        QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
+            if (iova_range->iova >= iova &&
+                iova_range->iova + iova_range->size <= iova +
+                                                       iotlb->addr_mask + 1) {
+                QLIST_REMOVE(iova_range, next);
+                g_free(iova_range);
+            }
+        }
+
         ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
@@ -643,6 +674,7 @@  static void vfio_listener_region_add(MemoryListener *listener,
             g_free(giommu);
             goto fail;
         }
+        QLIST_INIT(&giommu->iova_list);
         QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
         memory_region_iommu_replay(giommu->iommu, &giommu->n);
 
@@ -741,6 +773,13 @@  static void vfio_listener_region_del(MemoryListener *listener,
         QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
             if (MEMORY_REGION(giommu->iommu) == section->mr &&
                 giommu->n.start == section->offset_within_region) {
+                VFIOIovaRange *iova_range, *tmp;
+
+                QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, tmp) {
+                    QLIST_REMOVE(iova_range, next);
+                    g_free(iova_range);
+                }
+
                 memory_region_unregister_iommu_notifier(section->mr,
                                                         &giommu->n);
                 QLIST_REMOVE(giommu, giommu_next);
@@ -1538,6 +1577,13 @@  static void vfio_disconnect_container(VFIOGroup *group)
         QLIST_REMOVE(container, next);
 
         QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) {
+            VFIOIovaRange *iova_range, *itmp;
+
+            QLIST_FOREACH_SAFE(iova_range, &giommu->iova_list, next, itmp) {
+                QLIST_REMOVE(iova_range, next);
+                g_free(iova_range);
+            }
+
             memory_region_unregister_iommu_notifier(
                     MEMORY_REGION(giommu->iommu), &giommu->n);
             QLIST_REMOVE(giommu, giommu_next);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 5a57a78ec517..56b75e4a8bc4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -89,11 +89,19 @@  typedef struct VFIOContainer {
     QLIST_ENTRY(VFIOContainer) next;
 } VFIOContainer;
 
+typedef struct VFIOIovaRange {
+    hwaddr iova;
+    size_t size;
+    ram_addr_t ram_addr;
+    QLIST_ENTRY(VFIOIovaRange) next;
+} VFIOIovaRange;
+
 typedef struct VFIOGuestIOMMU {
     VFIOContainer *container;
     IOMMUMemoryRegion *iommu;
     hwaddr iommu_offset;
     IOMMUNotifier n;
+    QLIST_HEAD(, VFIOIovaRange) iova_list;
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
 } VFIOGuestIOMMU;