
[v2,17/20] vfio/common: Support device dirty page tracking with vIOMMU

Message ID 20230222174915.5647-18-avihaih@nvidia.com
State New
Series vfio: Add migration pre-copy support and device dirty tracking

Commit Message

Avihai Horon Feb. 22, 2023, 5:49 p.m. UTC
Currently, device dirty page tracking with vIOMMU is not supported - RAM
pages are perpetually marked dirty in this case.

When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as
the vIOMMU maps/unmaps them. These IOVA ranges can potentially be mapped
anywhere in the vIOMMU IOVA space.

Due to this dynamic nature of vIOMMU mapping/unmapping, tracking only
the currently mapped IOVA ranges, as done in the non-vIOMMU case,
doesn't work very well.

Instead, to support device dirty tracking when vIOMMU is enabled, track
the entire vIOMMU IOVA space. If that fails (the IOVA space can be rather
big and we might hit HW limitations), try tracking a smaller range while
marking untracked ranges dirty.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 include/hw/vfio/vfio-common.h |   2 +
 hw/vfio/common.c              | 196 +++++++++++++++++++++++++++++++---
 2 files changed, 181 insertions(+), 17 deletions(-)

Comments

Alex Williamson Feb. 22, 2023, 11:34 p.m. UTC | #1
On Wed, 22 Feb 2023 19:49:12 +0200
Avihai Horon <avihaih@nvidia.com> wrote:

> Currently, device dirty page tracking with vIOMMU is not supported - RAM
> pages are perpetually marked dirty in this case.
> 
> When vIOMMU is used, IOVA ranges are DMA mapped/unmapped on the fly as
> the vIOMMU maps/unmaps them. These IOVA ranges can potentially be mapped
> anywhere in the vIOMMU IOVA space.
> 
> Due to this dynamic nature of vIOMMU mapping/unmapping, tracking only
> the currently mapped IOVA ranges, as done in the non-vIOMMU case,
> doesn't work very well.
> 
> Instead, to support device dirty tracking when vIOMMU is enabled, track
> the entire vIOMMU IOVA space. If that fails (IOVA space can be rather
> big and we might hit HW limitation), try tracking smaller range while
> marking untracked ranges dirty.
> 
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>  include/hw/vfio/vfio-common.h |   2 +
>  hw/vfio/common.c              | 196 +++++++++++++++++++++++++++++++---
>  2 files changed, 181 insertions(+), 17 deletions(-)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 1f21e1fa43..1dc00cabcd 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -95,6 +95,8 @@ typedef struct VFIOContainer {
>      unsigned int dma_max_mappings;
>      IOVATree *mappings;
>      QemuMutex mappings_mutex;
> +    /* Represents the range [0, giommu_tracked_range) not inclusive */
> +    hwaddr giommu_tracked_range;
>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>      QLIST_HEAD(, VFIOGroup) group_list;
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 4a7fff6eeb..1024788bcc 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -45,6 +45,8 @@
>  #include "migration/qemu-file.h"
>  #include "sysemu/tpm.h"
>  #include "qemu/iova-tree.h"
> +#include "hw/boards.h"
> +#include "hw/mem/memory-device.h"
>  
>  VFIOGroupList vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -430,6 +432,38 @@ void vfio_unblock_multiple_devices_migration(void)
>      multiple_devices_migration_blocker = NULL;
>  }
>  
> +static uint64_t vfio_get_ram_size(void)
> +{
> +    MachineState *ms = MACHINE(qdev_get_machine());
> +    uint64_t plugged_size;
> +
> +    plugged_size = get_plugged_memory_size();
> +    if (plugged_size == (uint64_t)-1) {
> +        plugged_size = 0;
> +    }
> +
> +    return ms->ram_size + plugged_size;
> +}
> +
> +static int vfio_iommu_get_max_iova(VFIOContainer *container, hwaddr *max_iova)
> +{
> +    VFIOGuestIOMMU *giommu;
> +    int ret;
> +
> +    giommu = QLIST_FIRST(&container->giommu_list);
> +    if (!giommu) {
> +        return -ENOENT;
> +    }
> +
> +    ret = memory_region_iommu_get_attr(giommu->iommu_mr, IOMMU_ATTR_MAX_IOVA,
> +                                       max_iova);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    return 0;
> +}
> +
>  static bool vfio_have_giommu(VFIOContainer *container)
>  {
>      return !QLIST_EMPTY(&container->giommu_list);
> @@ -1510,7 +1544,8 @@ static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data)
>  }
>  
>  static struct vfio_device_feature *
> -vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
> +vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
> +                                             bool giommu)
>  {
>      struct vfio_device_feature *feature;
>      size_t feature_size;
> @@ -1529,6 +1564,16 @@ vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
>      control = (struct vfio_device_feature_dma_logging_control *)feature->data;
>      control->page_size = qemu_real_host_page_size();
>  
> +    if (giommu) {
> +        ranges = g_malloc0(sizeof(*ranges));
> +        ranges->iova = 0;
> +        ranges->length = container->giommu_tracked_range;
> +        control->num_ranges = 1;
> +        control->ranges = (uint64_t)ranges;
> +
> +        return feature;
> +    }
> +
>      QEMU_LOCK_GUARD(&container->mappings_mutex);
>  
>      /*
> @@ -1578,12 +1623,12 @@ static void vfio_device_feature_dma_logging_start_destroy(
>      g_free(feature);
>  }
>  
> -static int vfio_devices_dma_logging_start(VFIOContainer *container)
> +static int vfio_devices_dma_logging_start(VFIOContainer *container, bool giommu)
>  {
>      struct vfio_device_feature *feature;
>      int ret;
>  
> -    feature = vfio_device_feature_dma_logging_start_create(container);
> +    feature = vfio_device_feature_dma_logging_start_create(container, giommu);
>      if (!feature) {
>          return -errno;
>      }
> @@ -1598,18 +1643,128 @@ static int vfio_devices_dma_logging_start(VFIOContainer *container)
>      return ret;
>  }
>  
> +typedef struct {
> +    hwaddr *ranges;
> +    unsigned int ranges_num;
> +} VFIOGIOMMUDeviceDTRanges;
> +
> +/*
> + * This value is used in the second attempt to start device dirty tracking with
> + * vIOMMU, or if the giommu fails to report its max iova.
> + * It should be in the middle, not too big and not too small, allowing devices
> + * with HW limitations to do device dirty tracking while covering a fair amount
> + * of the IOVA space.
> + *
> + * This arbitrary value was chosen because it is the minimum value of Intel
> + * IOMMU max IOVA and mlx5 devices support tracking a range of this size.
> + */
> +#define VFIO_IOMMU_DEFAULT_MAX_IOVA ((1ULL << 39) - 1)
> +
> +#define VFIO_IOMMU_RANGES_NUM 3
> +static VFIOGIOMMUDeviceDTRanges *
> +vfio_iommu_device_dirty_tracking_ranges_create(VFIOContainer *container)
> +{
> +    hwaddr iommu_max_iova = VFIO_IOMMU_DEFAULT_MAX_IOVA;
> +    hwaddr retry_iova;
> +    hwaddr ram_size = vfio_get_ram_size();
> +    VFIOGIOMMUDeviceDTRanges *dt_ranges;
> +    int ret;
> +
> +    dt_ranges = g_try_new0(VFIOGIOMMUDeviceDTRanges, 1);
> +    if (!dt_ranges) {
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    dt_ranges->ranges_num = VFIO_IOMMU_RANGES_NUM;
> +
> +    dt_ranges->ranges = g_try_new0(hwaddr, dt_ranges->ranges_num);
> +    if (!dt_ranges->ranges) {
> +        g_free(dt_ranges);
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    /*
> +     * With vIOMMU we try to track the entire IOVA space. As the IOVA space can
> +     * be rather big, devices might not be able to track it due to HW
> +     * limitations. In that case:
> +     * (1) Retry tracking a smaller part of the IOVA space.
> +     * (2) Retry tracking a range in the size of the physical memory.

This looks really sketchy, why do we think there's a "good enough"
value here?  If we get it wrong, the device potentially has access to
IOVA space that we're not tracking, right?

I'd think the only viable fallback if the vIOMMU doesn't report its max
IOVA is the full 64-bit address space, otherwise it seems like we need
to add a migration blocker.
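
For reference, a blocker here could presumably reuse the same pattern as the
existing multiple-devices blocker in hw/vfio/common.c; a rough, untested
sketch, where the giommu_migration_blocker name is made up for illustration:

    static Error *giommu_migration_blocker;

    /* Hypothetical: registered when a container gains a vIOMMU region */
    static int vfio_block_giommu_migration(Error **errp)
    {
        int ret;

        if (giommu_migration_blocker) {
            return 0;
        }

        error_setg(&giommu_migration_blocker,
                   "Migration is currently not supported with vIOMMU enabled");
        ret = migrate_add_blocker(giommu_migration_blocker, errp);
        if (ret < 0) {
            /* Same cleanup as the existing VFIO blockers on failure */
            error_free(giommu_migration_blocker);
            giommu_migration_blocker = NULL;
        }

        return ret;
    }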

BTW, virtio-iommu is actively working to support vfio devices, we
should include support for it as well as VT-d.  Thanks,

Alex

> +     */
> +    ret = vfio_iommu_get_max_iova(container, &iommu_max_iova);
> +    if (!ret) {
> +        /* Check 2^64 wrap around */
> +        if (!REAL_HOST_PAGE_ALIGN(iommu_max_iova)) {
> +            iommu_max_iova -= qemu_real_host_page_size();
> +        }
> +    }
> +
> +    retry_iova = MIN(iommu_max_iova / 2, VFIO_IOMMU_DEFAULT_MAX_IOVA);
> +
> +    dt_ranges->ranges[0] = REAL_HOST_PAGE_ALIGN(iommu_max_iova);
> +    dt_ranges->ranges[1] = REAL_HOST_PAGE_ALIGN(retry_iova);
> +    dt_ranges->ranges[2] = REAL_HOST_PAGE_ALIGN(MIN(ram_size, retry_iova / 2));
> +
> +    return dt_ranges;
> +}
> +
> +static void vfio_iommu_device_dirty_tracking_ranges_destroy(
> +    VFIOGIOMMUDeviceDTRanges *dt_ranges)
> +{
> +    g_free(dt_ranges->ranges);
> +    g_free(dt_ranges);
> +}
> +
> +static int vfio_devices_start_dirty_page_tracking(VFIOContainer *container)
> +{
> +    VFIOGIOMMUDeviceDTRanges *dt_ranges;
> +    int ret;
> +    int i;
> +
> +    if (!vfio_have_giommu(container)) {
> +        return vfio_devices_dma_logging_start(container, false);
> +    }
> +
> +    dt_ranges = vfio_iommu_device_dirty_tracking_ranges_create(container);
> +    if (!dt_ranges) {
> +        return -errno;
> +    }
> +
> +    for (i = 0; i < dt_ranges->ranges_num; i++) {
> +        container->giommu_tracked_range = dt_ranges->ranges[i];
> +        ret = vfio_devices_dma_logging_start(container, true);
> +        if (!ret) {
> +            break;
> +        }
> +
> +        if (i < dt_ranges->ranges_num - 1) {
> +            warn_report("Failed to start device dirty tracking with vIOMMU "
> +                        "with range of size 0x%" HWADDR_PRIx
> +                        ", err: %d. Retrying with range "
> +                        "of size 0x%" HWADDR_PRIx,
> +                        dt_ranges->ranges[i], ret, dt_ranges->ranges[i + 1]);
> +        } else {
> +            error_report("Failed to start device dirty tracking with vIOMMU "
> +                         "with range of size 0x%" HWADDR_PRIx ", err: %d",
> +                         dt_ranges->ranges[i], ret);
> +        }
> +    }
> +
> +    vfio_iommu_device_dirty_tracking_ranges_destroy(dt_ranges);
> +
> +    return ret;
> +}
> +
>  static void vfio_listener_log_global_start(MemoryListener *listener)
>  {
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
>  
>      if (vfio_devices_all_device_dirty_tracking(container)) {
> -        if (vfio_have_giommu(container)) {
> -            /* Device dirty page tracking currently doesn't support vIOMMU */
> -            return;
> -        }
> -
> -        ret = vfio_devices_dma_logging_start(container);
> +        ret = vfio_devices_start_dirty_page_tracking(container);
>      } else {
>          ret = vfio_set_dirty_page_tracking(container, true);
>      }
> @@ -1627,11 +1782,6 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
>      int ret;
>  
>      if (vfio_devices_all_device_dirty_tracking(container)) {
> -        if (vfio_have_giommu(container)) {
> -            /* Device dirty page tracking currently doesn't support vIOMMU */
> -            return;
> -        }
> -
>          ret = vfio_devices_dma_logging_stop(container);
>      } else {
>          ret = vfio_set_dirty_page_tracking(container, false);
> @@ -1670,6 +1820,17 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
>      return 0;
>  }
>  
> +static bool vfio_iommu_range_is_device_tracked(VFIOContainer *container,
> +                                               hwaddr iova, hwaddr size)
> +{
> +    /* Check for 2^64 wrap around */
> +    if (!(iova + size)) {
> +        return false;
> +    }
> +
> +    return iova + size <= container->giommu_tracked_range;
> +}
> +
>  static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
>                                             VFIOBitmap *vbmap, hwaddr iova,
>                                             hwaddr size)
> @@ -1679,10 +1840,11 @@ static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
>      int ret;
>  
>      if (vfio_have_giommu(container)) {
> -        /* Device dirty page tracking currently doesn't support vIOMMU */
> -        bitmap_set(vbmap->bitmap, 0, vbmap->pages);
> +        if (!vfio_iommu_range_is_device_tracked(container, iova, size)) {
> +            bitmap_set(vbmap->bitmap, 0, vbmap->pages);
>  
> -        return 0;
> +            return 0;
> +        }
>      }
>  
>      QLIST_FOREACH(group, &container->group_list, container_next) {
Jason Gunthorpe Feb. 23, 2023, 2:08 a.m. UTC | #2
On Wed, Feb 22, 2023 at 04:34:39PM -0700, Alex Williamson wrote:
> > +    /*
> > +     * With vIOMMU we try to track the entire IOVA space. As the IOVA space can
> > +     * be rather big, devices might not be able to track it due to HW
> > +     * limitations. In that case:
> > +     * (1) Retry tracking a smaller part of the IOVA space.
> > +     * (2) Retry tracking a range in the size of the physical memory.
> 
> This looks really sketchy, why do we think there's a "good enough"
> value here?  If we get it wrong, the device potentially has access to
> IOVA space that we're not tracking, right?

The idea was the untracked range becomes permanently dirty, so at
worst this means the migration never converges.
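
For reference, the mechanism in this version is the untracked-range fallback
in vfio_devices_query_dirty_bitmap() in this patch:

    if (vfio_have_giommu(container)) {
        if (!vfio_iommu_range_is_device_tracked(container, iova, size)) {
            /* Range not covered by device dirty tracking: report it all dirty */
            bitmap_set(vbmap->bitmap, 0, vbmap->pages);

            return 0;
        }
    }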
 
#2 is the presumption that the guest is using an identity map.

> I'd think the only viable fallback if the vIOMMU doesn't report its max
> IOVA is the full 64-bit address space, otherwise it seems like we need
> to add a migration blocker.

This is basically saying vIOMMU doesn't work with migration, and we've
heard that this isn't OK. There are cases where vIOMMU is on but the
guest always uses identity maps. eg for virtual interrupt remapping.

We also have future problems that nested translation is incompatible
with device dirty tracking..

Jason
Alex Williamson Feb. 23, 2023, 8:06 p.m. UTC | #3
On Wed, 22 Feb 2023 22:08:33 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Feb 22, 2023 at 04:34:39PM -0700, Alex Williamson wrote:
> > > +    /*
> > > +     * With vIOMMU we try to track the entire IOVA space. As the IOVA space can
> > > +     * be rather big, devices might not be able to track it due to HW
> > > +     * limitations. In that case:
> > > +     * (1) Retry tracking a smaller part of the IOVA space.
> > > +     * (2) Retry tracking a range in the size of the physical memory.  
> > 
> > This looks really sketchy, why do we think there's a "good enough"
> > value here?  If we get it wrong, the device potentially has access to
> > IOVA space that we're not tracking, right?  
> 
> The idea was the untracked range becomes permanently dirty, so at
> worst this means the migration never converges.

I didn't spot the mechanics where that's implemented, I'll look again.
 
> #2 is the presumption that the guest is using an identity map.

This is a dangerous assumption.

> > I'd think the only viable fallback if the vIOMMU doesn't report its max
> > IOVA is the full 64-bit address space, otherwise it seems like we need
> > to add a migration blocker.  
> 
> This is basically saying vIOMMU doesn't work with migration, and we've
> heard that this isn't OK. There are cases where vIOMMU is on but the
> guest always uses identity maps. eg for virtual interrupt remapping.

Yes, the vIOMMU can be automatically added to a VM when we exceed 255
vCPUs, but I don't see how we can therefore deduce anything about the
usage mode of the vIOMMU.  Users also make use of vfio with vIOMMU for
nested assignment, ie. userspace drivers running within the guest,
where making assumptions about the IOVA extents of the userspace driver
seems dangerous.

Let's back up though: if a device doesn't support the full address width
of the platform, it's the responsibility of the device driver to
implement a DMA mask such that the device is never asked to DMA outside
of its address space support.  Therefore how could a device ever dirty
pages outside of its own limitations?
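
(I.e. the standard guest-driver contract; a minimal illustration for a
hypothetical device that can only address 39 bits of DMA:

    /* in the guest driver's probe path */
    if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(39)))
        return -EIO;
)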

Isn't it reasonable to require that a device support dirty tracking for
the entire extent of its DMA address width in order to support this
feature?

If we can make those assumptions, then the vfio driver should happily
accept a range exceeding the device's DMA address width capabilities,
knowing that the device cannot dirty anything beyond its addressable
range.

> We also have future problems that nested translation is incompatible
> with device dirty tracking..

:-\  Thanks,

Alex
Jason Gunthorpe Feb. 23, 2023, 8:55 p.m. UTC | #4
On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
> > #2 is the presumption that the guest is using an identity map.
> 
> This is a dangerous assumption.
> 
> > > I'd think the only viable fallback if the vIOMMU doesn't report its max
> > > IOVA is the full 64-bit address space, otherwise it seems like we need
> > > to add a migration blocker.  
> > 
> > This is basically saying vIOMMU doesn't work with migration, and we've
> > heard that this isn't OK. There are cases where vIOMMU is on but the
> > guest always uses identity maps. eg for virtual interrupt remapping.
> 
> Yes, the vIOMMU can be automatically added to a VM when we exceed 255
> vCPUs, but I don't see how we can therefore deduce anything about the
> usage mode of the vIOMMU.  

We just lose optimizations. Any mappings that are established outside
the dirty tracking range are permanently dirty. So at worst the guest
can block migration by establishing bad mappings. It is not exactly
production quality but it is still useful for a closed environment
with known guest configurations.

> nested assignment, ie. userspace drivers running within the guest,
> where making assumptions about the IOVA extents of the userspace driver
> seems dangerous.
>
> Let's backup though, if a device doesn't support the full address width
> of the platform, it's the responsibility of the device driver to
> implement a DMA mask such that the device is never asked to DMA outside
> of its address space support.  Therefore how could a device ever dirty
> pages outside of its own limitations?

The device always supports the full address space. We can't enforce
any kind of limit on the VM.

It just can't dirty track it all.

> Isn't it reasonable to require that a device support dirty tracking for
> the entire extent if its DMA address width in order to support this
> feature?

No, 2**64 is too big a number to be reasonable.

Ideally we'd work it the other way and tell the vIOMMU that the vHW
only supports a limited number of address bits for the translation, eg
through the ACPI tables. Then the dirty tracking could safely cover
the larger of all system memory or the limited IOVA address space.

Or even better figure out how to get interrupt remapping without IOMMU
support :\

Jason
Joao Martins Feb. 23, 2023, 9:30 p.m. UTC | #5
On 23/02/2023 20:55, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
>>> #2 is the presumption that the guest is using an identity map.
>> Isn't it reasonable to require that a device support dirty tracking for
>> the entire extent if its DMA address width in order to support this
>> feature?
> 
> No, 2**64 is too big a number to be reasonable.
> 
+1

> Ideally we'd work it the other way and tell the vIOMMU that the vHW
> only supports a limited number of address bits for the translation, eg
> through the ACPI tables. Then the dirty tracking could safely cover
> the larger of all system memory or the limited IOVA address space.
> 
> Or even better figure out how to get interrupt remapping without IOMMU
> support :\

FWIW That's generally my use of `iommu=pt` because all I want is interrupt
remapping, not the DMA remapping part. And this is going to be especially
relevant with these new boxes that easily surpass the >255 dedicated physical
CPUs mark with just two sockets.
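
Concretely, that's the usual guest kernel command line for this mode,
something like:

    intel_iommu=on iommu=pt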

The only other alternative I could see is to rely on an IOMMU attribute for DMA
translation. Today you can actually toggle that 'off' in VT-d (and I can imagine
the same thing working for AMD-vIOMMU). In Intel it just omits the 39
Address-width cap. And it means it doesn't have virtual addressing. Similar to
what Avihai already does for MAX_IOVA, we would do for DMA_TRANSLATION, and let
each vIOMMU implementation support that.

But to be honest I am not sure how robust relying on that is as that doesn't
really represent a hardware implementation. Without vIOMMU you have a (KVM) PV
op in new *guest* kernels that (ab)uses some unused bits in IOAPIC for a 24-bit
DestID. But this is only on new guests and hypervisors; old *guests* running
older < 5.15 kernels won't work.

... So iommu=pt really is the most convenient right now :/

	Joao
Alex Williamson Feb. 23, 2023, 10:33 p.m. UTC | #6
On Thu, 23 Feb 2023 16:55:54 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
> > > #2 is the presumption that the guest is using an identity map.  
> > 
> > This is a dangerous assumption.
> >   
> > > > I'd think the only viable fallback if the vIOMMU doesn't report its max
> > > > IOVA is the full 64-bit address space, otherwise it seems like we need
> > > > to add a migration blocker.    
> > > 
> > > This is basically saying vIOMMU doesn't work with migration, and we've
> > > heard that this isn't OK. There are cases where vIOMMU is on but the
> > > guest always uses identity maps. eg for virtual interrupt remapping.  
> > 
> > Yes, the vIOMMU can be automatically added to a VM when we exceed 255
> > vCPUs, but I don't see how we can therefore deduce anything about the
> > usage mode of the vIOMMU.    
> 
> We just loose optimizations. Any mappings that are established outside
> the dirty tracking range are permanently dirty. So at worst the guest
> can block migration by establishing bad mappings. It is not exactly
> production quality but it is still useful for a closed environment
> with known guest configurations.

That doesn't seem to be what happens in this series, nor does it really
make sense to me that userspace would simply decide to truncate the
dirty tracking ranges array.

> > nested assignment, ie. userspace drivers running within the guest,
> > where making assumptions about the IOVA extents of the userspace driver
> > seems dangerous.
> >
> > Let's backup though, if a device doesn't support the full address width
> > of the platform, it's the responsibility of the device driver to
> > implement a DMA mask such that the device is never asked to DMA outside
> > of its address space support.  Therefore how could a device ever dirty
> > pages outside of its own limitations?  
> 
> The device always supports the full address space. We can't enforce
> any kind of limit on the VM
> 
> It just can't dirty track it all.
> 
> > Isn't it reasonable to require that a device support dirty tracking for
> > the entire extent if its DMA address width in order to support this
> > feature?  
> 
> No, 2**64 is too big a number to be reasonable.

So what are the actual restrictions we're dealing with here?  I think it
would help us collaborate on a solution if we didn't have these device
specific restrictions sprinkled through the base implementation.

> Ideally we'd work it the other way and tell the vIOMMU that the vHW
> only supports a limited number of address bits for the translation, eg
> through the ACPI tables. Then the dirty tracking could safely cover
> the larger of all system memory or the limited IOVA address space.

Why can't we do that?  Hotplug is an obvious issue, but maybe it's not
vHW telling the vIOMMU a restriction, maybe it's a QEMU machine or
vIOMMU option and if it's not set to something the device can support,
migration is blocked.
 
> Or even better figure out how to get interrupt remapping without IOMMU
> support :\

-machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \
-device intel-iommu,caching-mode=on,intremap=on

Thanks,
Alex
Jason Gunthorpe Feb. 23, 2023, 11:26 p.m. UTC | #7
On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote:
> On Thu, 23 Feb 2023 16:55:54 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
> > > > #2 is the presumption that the guest is using an identity map.  
> > > 
> > > This is a dangerous assumption.
> > >   
> > > > > I'd think the only viable fallback if the vIOMMU doesn't report its max
> > > > > IOVA is the full 64-bit address space, otherwise it seems like we need
> > > > > to add a migration blocker.    
> > > > 
> > > > This is basically saying vIOMMU doesn't work with migration, and we've
> > > > heard that this isn't OK. There are cases where vIOMMU is on but the
> > > > guest always uses identity maps. eg for virtual interrupt remapping.  
> > > 
> > > Yes, the vIOMMU can be automatically added to a VM when we exceed 255
> > > vCPUs, but I don't see how we can therefore deduce anything about the
> > > usage mode of the vIOMMU.    
> > 
> > We just loose optimizations. Any mappings that are established outside
> > the dirty tracking range are permanently dirty. So at worst the guest
> > can block migration by establishing bad mappings. It is not exactly
> > production quality but it is still useful for a closed environment
> > with known guest configurations.
> 
> That doesn't seem to be what happens in this series, 

Seems like something is missed then

> nor does it really make sense to me that userspace would simply
> decide to truncate the dirty tracking ranges array.

Who else would do it?

> > No, 2**64 is too big a number to be reasonable.
> 
> So what are the actual restrictions were dealing with here?  I think it
> would help us collaborate on a solution if we didn't have these device
> specific restrictions sprinkled through the base implementation.

Hmm? It was always like this: the driver gets to decide if it accepts
the proposed tracking ranges or not. Given how the implementation has
to work there is no device that could do 2**64...

At least for mlx5 it is in the multi-TB range. Enough for physical
memory on any real server.

> > Ideally we'd work it the other way and tell the vIOMMU that the vHW
> > only supports a limited number of address bits for the translation, eg
> > through the ACPI tables. Then the dirty tracking could safely cover
> > the larger of all system memory or the limited IOVA address space.
> 
> Why can't we do that?  Hotplug is an obvious issue, but maybe it's not
> vHW telling the vIOMMU a restriction, maybe it's a QEMU machine or
> vIOMMU option and if it's not set to something the device can support,
> migration is blocked.

I don't know, maybe we should if we can.

> > Or even better figure out how to get interrupt remapping without IOMMU
> > support :\
> 
> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \
> -device intel-iommu,caching-mode=on,intremap=on

Joao?

If this works let's just block migration if the vIOMMU is turned on..

Jason
Joao Martins Feb. 24, 2023, 11:25 a.m. UTC | #8
On 23/02/2023 23:26, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote:
>> On Thu, 23 Feb 2023 16:55:54 -0400
>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
>>> Or even better figure out how to get interrupt remapping without IOMMU
>>> support :\
>>
>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \
>> -device intel-iommu,caching-mode=on,intremap=on
> 
> Joao?
> 
> If this works lets just block migration if the vIOMMU is turned on..

At a first glance, this looked like my regular iommu incantation.

But reading the code, this ::bypass_iommu (new to me) apparently tells whether
the vIOMMU is bypassed or not for the PCI devices, all the way to avoiding
enumeration in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether
the PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and
such.

You can see from the other email that all of the other options in my head were
either a bit inconvenient or risky. I wasn't aware of this option, for what it's
worth -- much simpler, should work!

And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live
migration blocker if `bypass_iommu` is off for any PCI device.

	Joao
Joao Martins Feb. 24, 2023, 12:53 p.m. UTC | #9
On 24/02/2023 11:25, Joao Martins wrote:
> On 23/02/2023 23:26, Jason Gunthorpe wrote:
>> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote:
>>> On Thu, 23 Feb 2023 16:55:54 -0400
>>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
>>>> Or even better figure out how to get interrupt remapping without IOMMU
>>>> support :\
>>>
>>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \
>>> -device intel-iommu,caching-mode=on,intremap=on
>>
>> Joao?
>>
>> If this works lets just block migration if the vIOMMU is turned on..
> 
> At a first glance, this looked like my regular iommu incantation.
> 
> But reading the code this ::bypass_iommu (new to me) apparently tells that
> vIOMMU is bypassed or not for the PCI devices all the way to avoiding
> enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether
> PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and
> such.
> 
> You can see from the other email that all of the other options in my head were
> either bit inconvenient or risky. I wasn't aware of this option for what is
> worth -- much simpler, should work!
>

I say *should*, but on second thought interrupt remapping may still be
required by one of these devices that are IOMMU-bypassed. Say, to set affinities
to vCPUs above 255? I was trying this out with more than 255 vCPUs with a couple
VFs, and at first glance these VFs fail to probe (these are CX6 VFs).

It is a working setup without the parameter, but now adding a
default_bus_bypass_iommu=on fails to init VFs:

[   32.412733] mlx5_core 0000:00:02.0: Rate limit: 127 rates are supported,
range: 0Mbps to 97656Mbps
[   32.416242] mlx5_core 0000:00:02.0: mlx5_load:1204:(pid 3361): Failed to
alloc IRQs
[   33.227852] mlx5_core 0000:00:02.0: probe_one:1684:(pid 3361): mlx5_init_one
failed with error code -19
[   33.242182] mlx5_core 0000:00:03.0: firmware version: 22.31.1660
[   33.415876] mlx5_core 0000:00:03.0: Rate limit: 127 rates are supported,
range: 0Mbps to 97656Mbps
[   33.448016] mlx5_core 0000:00:03.0: mlx5_load:1204:(pid 3361): Failed to
alloc IRQs
[   34.207532] mlx5_core 0000:00:03.0: probe_one:1684:(pid 3361): mlx5_init_one
failed with error code -19

I haven't dived yet into why it fails.

> And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live
> migration blocker if `bypass_iommu` is off for any PCI device.
> 

Still, for starters we could have a live migration blocker until we revisit the
vIOMMU case ... should we deem default_bus_bypass_iommu=on and the
others I suggested to be non-options?
Jason Gunthorpe Feb. 24, 2023, 3:47 p.m. UTC | #10
On Fri, Feb 24, 2023 at 12:53:26PM +0000, Joao Martins wrote:
> > But reading the code this ::bypass_iommu (new to me) apparently tells that
> > vIOMMU is bypassed or not for the PCI devices all the way to avoiding
> > enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether
> > PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and
> > such.
> > 
> > You can see from the other email that all of the other options in my head were
> > either bit inconvenient or risky. I wasn't aware of this option for what is
> > worth -- much simpler, should work!
> >
> 
> I say *should*, but on a second thought interrupt remapping may still be
> required to one of these devices that are IOMMU-bypassed. Say to put affinities
> to vcpus above 255? I was trying this out with more than 255 vcpus with a couple
> VFs and at a first glance these VFs fail to probe (these are CX6
> VFs).

It is pretty bizarre, but the Intel iommu driver is responsible for
installing the interrupt remapping irq driver on the devices.

So if there is no iommu driver bound then there won't be any interrupt
remapping capability for the device even if the interrupt remapping HW
is otherwise setup.

The only reason Avihai is touching this is to try and keep the
interrupt remapping emulation usable; we could certainly punt on that
for now if it looks too ugly.

Jason
Alex Williamson Feb. 24, 2023, 3:56 p.m. UTC | #11
On Fri, 24 Feb 2023 12:53:26 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 24/02/2023 11:25, Joao Martins wrote:
> > On 23/02/2023 23:26, Jason Gunthorpe wrote:  
> >> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote:  
> >>> On Thu, 23 Feb 2023 16:55:54 -0400
> >>> Jason Gunthorpe <jgg@nvidia.com> wrote:  
> >>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
> >>>> Or even better figure out how to get interrupt remapping without IOMMU
> >>>> support :\  
> >>>
> >>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \
> >>> -device intel-iommu,caching-mode=on,intremap=on  
> >>
> >> Joao?
> >>
> >> If this works lets just block migration if the vIOMMU is turned on..  
> > 
> > At a first glance, this looked like my regular iommu incantation.
> > 
> > But reading the code this ::bypass_iommu (new to me) apparently tells that
> > vIOMMU is bypassed or not for the PCI devices all the way to avoiding
> > enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether
> > PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and
> > such.
> > 
> > You can see from the other email that all of the other options in my head were
> > either bit inconvenient or risky. I wasn't aware of this option for what is
> > worth -- much simpler, should work!
> >  
> 
> I say *should*, but on a second thought interrupt remapping may still be
> required to one of these devices that are IOMMU-bypassed. Say to put affinities
> to vcpus above 255? I was trying this out with more than 255 vcpus with a couple
> VFs and at a first glance these VFs fail to probe (these are CX6 VFs).
> 
> It is a working setup without the parameter, but now adding a
> default_bus_bypass_iommu=on fails to init VFs:
> 
> [   32.412733] mlx5_core 0000:00:02.0: Rate limit: 127 rates are supported,
> range: 0Mbps to 97656Mbps
> [   32.416242] mlx5_core 0000:00:02.0: mlx5_load:1204:(pid 3361): Failed to
> alloc IRQs
> [   33.227852] mlx5_core 0000:00:02.0: probe_one:1684:(pid 3361): mlx5_init_one
> failed with error code -19
> [   33.242182] mlx5_core 0000:00:03.0: firmware version: 22.31.1660
> [   33.415876] mlx5_core 0000:00:03.0: Rate limit: 127 rates are supported,
> range: 0Mbps to 97656Mbps
> [   33.448016] mlx5_core 0000:00:03.0: mlx5_load:1204:(pid 3361): Failed to
> alloc IRQs
> [   34.207532] mlx5_core 0000:00:03.0: probe_one:1684:(pid 3361): mlx5_init_one
> failed with error code -19
> 
> I haven't dived yet into why it fails.

Hmm, I was thinking this would only affect DMA, but on second thought
I think the DRHD also describes the interrupt remapping hardware and
while interrupt remapping is an optional feature of the DRHD, DMA
remapping is always supported afaict.  I saw IR vectors in
/proc/interrupts and thought it worked, but indeed an assigned device
is having trouble getting vectors.

> 
> > And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live
> > migration blocker if `bypass_iommu` is off for any PCI device.
> >   
> 
> Still we could have for starters a live migration blocker until we revisit the
> vIOMMU case ... should we deem that the default_bus_bypass_iommu=on or the
> others I suggested as non-options?

I'm very uncomfortable presuming a vIOMMU usage model, especially when
it leads to potentially untracked DMA if our assumptions are violated.
We could use a MemoryListener on the IOVA space to record a high level
mark, but we'd need to continue to monitor that mark while we're in
pre-copy and I don't think anyone would agree that a migratable VM can
suddenly become unmigratable due to a random IOVA allocation would be
supportable.  That leads me to think that a machine option to limit the
vIOMMU address space, and testing that against the device prior to
declaring migration support of the device is possibly our best option.
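
Purely as an illustration of that high-water-mark idea (via the existing IOMMU
map notifier rather than a separate MemoryListener, and it doesn't solve the
"unmigratable mid pre-copy" problem), something like the following in
vfio_iommu_map_notify(), where giommu_dirty_watermark is a made-up field:

    /* On map events, remember the highest IOVA the guest has ever mapped */
    if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
        container->giommu_dirty_watermark =
            MAX(container->giommu_dirty_watermark,
                REAL_HOST_PAGE_ALIGN(iotlb->iova + iotlb->addr_mask + 1));
    }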

Is that feasible?  Do all the vIOMMU models have a means to limit the
IOVA space?  How does QEMU learn a limit for a given device?  We
probably need to think about whether there are devices that can even
support the guest physical memory ranges when we start relocating RAM
to arbitrary addresses (ex. hypertransport).  Can we infer anything
from the vCPU virtual address space or is that still an unreasonable
range to track for devices?  Thanks,

Alex
Joao Martins Feb. 24, 2023, 7:16 p.m. UTC | #12
On 24/02/2023 15:56, Alex Williamson wrote:
> On Fri, 24 Feb 2023 12:53:26 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> On 24/02/2023 11:25, Joao Martins wrote:
>>> On 23/02/2023 23:26, Jason Gunthorpe wrote:  
>>>> On Thu, Feb 23, 2023 at 03:33:09PM -0700, Alex Williamson wrote:  
>>>>> On Thu, 23 Feb 2023 16:55:54 -0400
>>>>> Jason Gunthorpe <jgg@nvidia.com> wrote:  
>>>>>> On Thu, Feb 23, 2023 at 01:06:33PM -0700, Alex Williamson wrote:
>>>>>> Or even better figure out how to get interrupt remapping without IOMMU
>>>>>> support :\  
>>>>>
>>>>> -machine q35,default_bus_bypass_iommu=on,kernel-irqchip=split \
>>>>> -device intel-iommu,caching-mode=on,intremap=on  
>>>>
>>>> Joao?
>>>>
>>>> If this works lets just block migration if the vIOMMU is turned on..  
>>>
>>> At a first glance, this looked like my regular iommu incantation.
>>>
>>> But reading the code this ::bypass_iommu (new to me) apparently tells that
>>> vIOMMU is bypassed or not for the PCI devices all the way to avoiding
>>> enumerating in the IVRS/DMAR ACPI tables. And I see VFIO double-checks whether
>>> PCI device is within the IOMMU address space (or bypassed) prior to DMA maps and
>>> such.
>>>
>>> You can see from the other email that all of the other options in my head were
>>> either bit inconvenient or risky. I wasn't aware of this option for what is
>>> worth -- much simpler, should work!
>>>  
>>
>> I say *should*, but on a second thought interrupt remapping may still be
>> required to one of these devices that are IOMMU-bypassed. Say to put affinities
>> to vcpus above 255? I was trying this out with more than 255 vcpus with a couple
>> VFs and at a first glance these VFs fail to probe (these are CX6 VFs).
>>
>> It is a working setup without the parameter, but now adding a
>> default_bus_bypass_iommu=on fails to init VFs:
>>
>> [   32.412733] mlx5_core 0000:00:02.0: Rate limit: 127 rates are supported,
>> range: 0Mbps to 97656Mbps
>> [   32.416242] mlx5_core 0000:00:02.0: mlx5_load:1204:(pid 3361): Failed to
>> alloc IRQs
>> [   33.227852] mlx5_core 0000:00:02.0: probe_one:1684:(pid 3361): mlx5_init_one
>> failed with error code -19
>> [   33.242182] mlx5_core 0000:00:03.0: firmware version: 22.31.1660
>> [   33.415876] mlx5_core 0000:00:03.0: Rate limit: 127 rates are supported,
>> range: 0Mbps to 97656Mbps
>> [   33.448016] mlx5_core 0000:00:03.0: mlx5_load:1204:(pid 3361): Failed to
>> alloc IRQs
>> [   34.207532] mlx5_core 0000:00:03.0: probe_one:1684:(pid 3361): mlx5_init_one
>> failed with error code -19
>>
>> I haven't dived yet into why it fails.
> 
> Hmm, I was thinking this would only affect DMA, but on second thought
> I think the DRHD also describes the interrupt remapping hardware and
> while interrupt remapping is an optional feature of the DRHD, DMA
> remapping is always supported afaict.  I saw IR vectors in
> /proc/interrupts and thought it worked, but indeed an assigned device
> is having trouble getting vectors.
> 

AMD/IVRS might be a little different.

I also tried disabling dma-translation from IOMMU feature as I had mentioned in
another email, and that renders the same result as default_bus_bypass_iommu.

So it's either this KVM pv-op (which is not really interrupt remapping and it's
x86 specific) or full vIOMMU. The PV op[*] has the natural disadvantage of
requiring a compatible guest kernel.

[*] See, KVM_FEATURE_MSI_EXT_DEST_ID.

>>
>>> And avoiding vIOMMU simplifies the whole patchset too, if it's OK to add a live
>>> migration blocker if `bypass_iommu` is off for any PCI device.
>>>   
>>
>> Still we could have for starters a live migration blocker until we revisit the
>> vIOMMU case ... should we deem that the default_bus_bypass_iommu=on or the
>> others I suggested as non-options?
> 
> I'm very uncomfortable presuming a vIOMMU usage model, especially when
> it leads to potentially untracked DMA if our assumptions are violated.

We can track DMA that got dirtied, but it doesn't mean that said DMA is mapped.
I don't think VFIO ties those two together? Like you can ask to track certain ranges,
but if a DMA isn't mapped in the IOMMU then the device gets a target abort. Starting
dirty tracking doesn't imply that you allow such DMA.

with vIOMMU it's just that anything falling outside the IOMMU-mapped ranges (or
identity map) always gets marked as dirty if it wasn't armed in the device dirty
tracker. It's a best-effort basis -- as I don't think supporting vIOMMU has a
ton of options without a more significant compromise. If the vIOMMU is in
passthrough mode, then things work just as if no-vIOMMU is there. Avihai's code
reflects that.

Considering your earlier suggestion that we only start dirty tracking and record
ranges *when* the dirty tracking start operation happens ... this gets further
simplified. We also have to take into account that we have no guarantee that
ranges already under tracking can be changed dynamically.

For improving the vIOMMU case we either track the MAX_IOVA or we compose an
artificial range based on the max IOVA of the current vIOMMU maps.

> We could use a MemoryListener on the IOVA space to record a high level
> mark, but we'd need to continue to monitor that mark while we're in
> pre-copy and I don't think anyone would agree that a migratable VM can
> suddenly become unmigratable due to a random IOVA allocation would be
> supportable.  That leads me to think that a machine option to limit the
> vIOMMU address space, and testing that against the device prior to
> declaring migration support of the device is possibly our best option.
> 
> Is that feasible?  Do all the vIOMMU models have a means to limit the
> IOVA space? 

I can say that *at least* AMD and Intel support that. Intel supports either 39
or 48 address-width modes (only those two values as I understand). AMD
supposedly has a more granular management of VASize and PASize.

I have no idea on smmuv3 or virtio-iommu.
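
If I'm remembering the property right, the Intel one is already configurable on
the QEMU command line via the (experimental) x-aw-bits knob, which only takes
39 or 48, e.g.:

    -device intel-iommu,caching-mode=on,intremap=on,x-aw-bits=39

so in principle the device's tracking limit could be checked against that
before declaring migration support, rather than guessing.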

But isn't this actually what Avihai does in the series, minus the device
part? The address width is fetched directly from the vIOMMU model, via
IOMMU_ATTR_MAX_IOVA, and one of the options is to compose a range based on the max
vIOMMU range.

> How does QEMU learn a limit for a given device? 

IOMMU_ATTR_MAX_IOVA for vIOMMU

For the device this is not described in ACPI or any other place that I know of :/
without getting into VF specifics.

> We
> probably need to think about whether there are devices that can even
> support the guest physical memory ranges when we start relocating RAM
> to arbitrary addresses (ex. hypertransport). 

In theory we require one bit more in the device DMA engine, so instead of a max of
39 bits we require 40 bits for a 1T guest. GPUs and modern NICs are 64-bit DMA address
capable devices, but it's a bit hard to learn this as it's device specific.

> Can we infer anything
> from the vCPU virtual address space or is that still an unreasonable
> range to track for devices?  Thanks,
> 
We sort of rely on that for the iommu=pt or no-vIOMMU case, where the vCPU address
space matches the IOVA space, but I'm not sure how much you would learn from the
vCPU address space that the vIOMMU mapping doesn't already give you.

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 1f21e1fa43..1dc00cabcd 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -95,6 +95,8 @@  typedef struct VFIOContainer {
     unsigned int dma_max_mappings;
     IOVATree *mappings;
     QemuMutex mappings_mutex;
+    /* Represents the range [0, giommu_tracked_range) not inclusive */
+    hwaddr giommu_tracked_range;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 4a7fff6eeb..1024788bcc 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -45,6 +45,8 @@ 
 #include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
 #include "qemu/iova-tree.h"
+#include "hw/boards.h"
+#include "hw/mem/memory-device.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -430,6 +432,38 @@  void vfio_unblock_multiple_devices_migration(void)
     multiple_devices_migration_blocker = NULL;
 }
 
+static uint64_t vfio_get_ram_size(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    uint64_t plugged_size;
+
+    plugged_size = get_plugged_memory_size();
+    if (plugged_size == (uint64_t)-1) {
+        plugged_size = 0;
+    }
+
+    return ms->ram_size + plugged_size;
+}
+
+static int vfio_iommu_get_max_iova(VFIOContainer *container, hwaddr *max_iova)
+{
+    VFIOGuestIOMMU *giommu;
+    int ret;
+
+    giommu = QLIST_FIRST(&container->giommu_list);
+    if (!giommu) {
+        return -ENOENT;
+    }
+
+    ret = memory_region_iommu_get_attr(giommu->iommu_mr, IOMMU_ATTR_MAX_IOVA,
+                                       max_iova);
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
 static bool vfio_have_giommu(VFIOContainer *container)
 {
     return !QLIST_EMPTY(&container->giommu_list);
@@ -1510,7 +1544,8 @@  static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data)
 }
 
 static struct vfio_device_feature *
-vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
+vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
+                                             bool giommu)
 {
     struct vfio_device_feature *feature;
     size_t feature_size;
@@ -1529,6 +1564,16 @@  vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
     control = (struct vfio_device_feature_dma_logging_control *)feature->data;
     control->page_size = qemu_real_host_page_size();
 
+    if (giommu) {
+        ranges = g_malloc0(sizeof(*ranges));
+        ranges->iova = 0;
+        ranges->length = container->giommu_tracked_range;
+        control->num_ranges = 1;
+        control->ranges = (uint64_t)ranges;
+
+        return feature;
+    }
+
     QEMU_LOCK_GUARD(&container->mappings_mutex);
 
     /*
@@ -1578,12 +1623,12 @@  static void vfio_device_feature_dma_logging_start_destroy(
     g_free(feature);
 }
 
-static int vfio_devices_dma_logging_start(VFIOContainer *container)
+static int vfio_devices_dma_logging_start(VFIOContainer *container, bool giommu)
 {
     struct vfio_device_feature *feature;
     int ret;
 
-    feature = vfio_device_feature_dma_logging_start_create(container);
+    feature = vfio_device_feature_dma_logging_start_create(container, giommu);
     if (!feature) {
         return -errno;
     }
@@ -1598,18 +1643,128 @@  static int vfio_devices_dma_logging_start(VFIOContainer *container)
     return ret;
 }
 
+typedef struct {
+    hwaddr *ranges;
+    unsigned int ranges_num;
+} VFIOGIOMMUDeviceDTRanges;
+
+/*
+ * This value is used in the second attempt to start device dirty tracking with
+ * vIOMMU, or if the giommu fails to report its max iova.
+ * It should be in the middle, not too big and not too small, allowing devices
+ * with HW limitations to do device dirty tracking while covering a fair amount
+ * of the IOVA space.
+ *
+ * This arbitrary value was chosen because it is the minimum value of Intel
+ * IOMMU max IOVA and mlx5 devices support tracking a range of this size.
+ */
+#define VFIO_IOMMU_DEFAULT_MAX_IOVA ((1ULL << 39) - 1)
+
+#define VFIO_IOMMU_RANGES_NUM 3
+static VFIOGIOMMUDeviceDTRanges *
+vfio_iommu_device_dirty_tracking_ranges_create(VFIOContainer *container)
+{
+    hwaddr iommu_max_iova = VFIO_IOMMU_DEFAULT_MAX_IOVA;
+    hwaddr retry_iova;
+    hwaddr ram_size = vfio_get_ram_size();
+    VFIOGIOMMUDeviceDTRanges *dt_ranges;
+    int ret;
+
+    dt_ranges = g_try_new0(VFIOGIOMMUDeviceDTRanges, 1);
+    if (!dt_ranges) {
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    dt_ranges->ranges_num = VFIO_IOMMU_RANGES_NUM;
+
+    dt_ranges->ranges = g_try_new0(hwaddr, dt_ranges->ranges_num);
+    if (!dt_ranges->ranges) {
+        g_free(dt_ranges);
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    /*
+     * With vIOMMU we try to track the entire IOVA space. As the IOVA space can
+     * be rather big, devices might not be able to track it due to HW
+     * limitations. In that case:
+     * (1) Retry tracking a smaller part of the IOVA space.
+     * (2) Retry tracking a range in the size of the physical memory.
+     */
+    ret = vfio_iommu_get_max_iova(container, &iommu_max_iova);
+    if (!ret) {
+        /* Check 2^64 wrap around */
+        if (!REAL_HOST_PAGE_ALIGN(iommu_max_iova)) {
+            iommu_max_iova -= qemu_real_host_page_size();
+        }
+    }
+
+    retry_iova = MIN(iommu_max_iova / 2, VFIO_IOMMU_DEFAULT_MAX_IOVA);
+
+    dt_ranges->ranges[0] = REAL_HOST_PAGE_ALIGN(iommu_max_iova);
+    dt_ranges->ranges[1] = REAL_HOST_PAGE_ALIGN(retry_iova);
+    dt_ranges->ranges[2] = REAL_HOST_PAGE_ALIGN(MIN(ram_size, retry_iova / 2));
+
+    return dt_ranges;
+}
+
+static void vfio_iommu_device_dirty_tracking_ranges_destroy(
+    VFIOGIOMMUDeviceDTRanges *dt_ranges)
+{
+    g_free(dt_ranges->ranges);
+    g_free(dt_ranges);
+}
+
+static int vfio_devices_start_dirty_page_tracking(VFIOContainer *container)
+{
+    VFIOGIOMMUDeviceDTRanges *dt_ranges;
+    int ret;
+    int i;
+
+    if (!vfio_have_giommu(container)) {
+        return vfio_devices_dma_logging_start(container, false);
+    }
+
+    dt_ranges = vfio_iommu_device_dirty_tracking_ranges_create(container);
+    if (!dt_ranges) {
+        return -errno;
+    }
+
+    for (i = 0; i < dt_ranges->ranges_num; i++) {
+        container->giommu_tracked_range = dt_ranges->ranges[i];
+        ret = vfio_devices_dma_logging_start(container, true);
+        if (!ret) {
+            break;
+        }
+
+        if (i < dt_ranges->ranges_num - 1) {
+            warn_report("Failed to start device dirty tracking with vIOMMU "
+                        "with range of size 0x%" HWADDR_PRIx
+                        ", err: %d. Retrying with range "
+                        "of size 0x%" HWADDR_PRIx,
+                        dt_ranges->ranges[i], ret, dt_ranges->ranges[i + 1]);
+        } else {
+            error_report("Failed to start device dirty tracking with vIOMMU "
+                         "with range of size 0x%" HWADDR_PRIx ", err: %d",
+                         dt_ranges->ranges[i], ret);
+        }
+    }
+
+    vfio_iommu_device_dirty_tracking_ranges_destroy(dt_ranges);
+
+    return ret;
+}
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;
 
     if (vfio_devices_all_device_dirty_tracking(container)) {
-        if (vfio_have_giommu(container)) {
-            /* Device dirty page tracking currently doesn't support vIOMMU */
-            return;
-        }
-
-        ret = vfio_devices_dma_logging_start(container);
+        ret = vfio_devices_start_dirty_page_tracking(container);
     } else {
         ret = vfio_set_dirty_page_tracking(container, true);
     }
@@ -1627,11 +1782,6 @@  static void vfio_listener_log_global_stop(MemoryListener *listener)
     int ret;
 
     if (vfio_devices_all_device_dirty_tracking(container)) {
-        if (vfio_have_giommu(container)) {
-            /* Device dirty page tracking currently doesn't support vIOMMU */
-            return;
-        }
-
         ret = vfio_devices_dma_logging_stop(container);
     } else {
         ret = vfio_set_dirty_page_tracking(container, false);
@@ -1670,6 +1820,17 @@  static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
     return 0;
 }
 
+static bool vfio_iommu_range_is_device_tracked(VFIOContainer *container,
+                                               hwaddr iova, hwaddr size)
+{
+    /* Check for 2^64 wrap around */
+    if (!(iova + size)) {
+        return false;
+    }
+
+    return iova + size <= container->giommu_tracked_range;
+}
+
 static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
                                            VFIOBitmap *vbmap, hwaddr iova,
                                            hwaddr size)
@@ -1679,10 +1840,11 @@  static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
     int ret;
 
     if (vfio_have_giommu(container)) {
-        /* Device dirty page tracking currently doesn't support vIOMMU */
-        bitmap_set(vbmap->bitmap, 0, vbmap->pages);
+        if (!vfio_iommu_range_is_device_tracked(container, iova, size)) {
+            bitmap_set(vbmap->bitmap, 0, vbmap->pages);
 
-        return 0;
+            return 0;
+        }
     }
 
     QLIST_FOREACH(group, &container->group_list, container_next) {