diff mbox series

[v3,05/10] vfio: Support for RamDiscardMgr in the !vIOMMU case

Message ID 20201216141200.118742-6-david@redhat.com
State New
Headers show
Series virtio-mem: vfio support | expand

Commit Message

David Hildenbrand Dec. 16, 2020, 2:11 p.m. UTC
Implement support for RamDiscardMgr, to prepare for virtio-mem
support. Instead of mapping the whole memory section, we only map
"populated" parts and update the mapping when notified about
discarding/population of memory via the RamDiscardListener. Similarly, when
syncing the dirty bitmaps, sync only the actually mapped (populated) parts
by replaying via the notifier.

Small mapping granularity is problematic for vfio, because we might run out
of mappings. Indicate virtio-mem as one of the problematic parts when
warning in vfio_container_dma_reserve() to at least make users aware that
there is such a limitation.

Using virtio-mem with vfio is still blocked via
ram_block_discard_disable()/ram_block_discard_require() after this patch.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 hw/vfio/common.c              | 213 +++++++++++++++++++++++++++++++++-
 include/hw/vfio/vfio-common.h |  13 +++
 2 files changed, 225 insertions(+), 1 deletion(-)

Comments

Alex Williamson Dec. 17, 2020, 6:36 p.m. UTC | #1
On Wed, 16 Dec 2020 15:11:55 +0100
David Hildenbrand <david@redhat.com> wrote:

> Implement support for RamDiscardMgr, to prepare for virtio-mem
> support. Instead of mapping the whole memory section, we only map
> "populated" parts and update the mapping when notified about
> discarding/population of memory via the RamDiscardListener. Similarly, when
> syncing the dirty bitmaps, sync only the actually mapped (populated) parts
> by replaying via the notifier.
> 
> Small mapping granularity is problematic for vfio, because we might run out
> of mappings. Indicate virito-mem as one of the problematic parts when
> warning in vfio_container_dma_reserve() to at least make users aware that
> there is such a limitation.
> 
> Using virtio-mem with vfio is still blocked via
> ram_block_discard_disable()/ram_block_discard_require() after this patch.
> 
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Cc: Alex Williamson <alex.williamson@redhat.com>
> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Cc: Igor Mammedov <imammedo@redhat.com>
> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: Auger Eric <eric.auger@redhat.com>
> Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
> Cc: teawater <teawaterz@linux.alibaba.com>
> Cc: Marek Kedzierski <mkedzier@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  hw/vfio/common.c              | 213 +++++++++++++++++++++++++++++++++-
>  include/hw/vfio/vfio-common.h |  13 +++
>  2 files changed, 225 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 5ad88d476f..b1582be1e8 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -296,7 +296,8 @@ static void vfio_container_dma_reserve(VFIOContainer *container,
>      container->dma_reserved += dma_mappings;
>      if (!warned && container->dma_max &&
>          container->dma_reserved > container->dma_max) {
> -        warn_report("%s: possibly running out of DMA mappings. "
> +        warn_report("%s: possibly running out of DMA mappings. E.g., try"
> +                    " increasing the 'block-size' of virtio-mem devices."
>                      " Maximum number of DMA mappings: %d", __func__,
>                      container->dma_max);
>      }
> @@ -674,6 +675,146 @@ out:
>      rcu_read_unlock();
>  }
>  
> +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
> +                                            const MemoryRegion *mr,
> +                                            ram_addr_t offset, ram_addr_t size)
> +{
> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> +                                                listener);
> +    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
> +    const hwaddr mr_end = MIN(offset + size,
> +                              vrdl->offset_within_region + vrdl->size);
> +    const hwaddr iova = mr_start - vrdl->offset_within_region +
> +                        vrdl->offset_within_address_space;
> +    int ret;
> +
> +    if (mr_start >= mr_end) {
> +        return;
> +    }
> +
> +    /* Unmap with a single call. */
> +    ret = vfio_dma_unmap(vrdl->container, iova, mr_end - mr_start, NULL);
> +    if (ret) {
> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
> +                     strerror(-ret));
> +    }
> +}
> +
> +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
> +                                            const MemoryRegion *mr,
> +                                            ram_addr_t offset, ram_addr_t size)
> +{
> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> +                                                listener);
> +    const hwaddr mr_end = MIN(offset + size,
> +                              vrdl->offset_within_region + vrdl->size);
> +    hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
> +    hwaddr mr_next, iova;
> +    void *vaddr;
> +    int ret;
> +
> +    /*
> +     * Map in (aligned within memory region) minimum granularity, so we can
> +     * unmap in minimum granularity later.
> +     */
> +    for (; mr_start < mr_end; mr_start = mr_next) {
> +        mr_next = QEMU_ALIGN_UP(mr_start + 1, vrdl->granularity);
> +        mr_next = MIN(mr_next, mr_end);
> +
> +        iova = mr_start - vrdl->offset_within_region +
> +               vrdl->offset_within_address_space;
> +        vaddr = memory_region_get_ram_ptr(vrdl->mr) + mr_start;
> +
> +        ret = vfio_dma_map(vrdl->container, iova, mr_next - mr_start,
> +                           vaddr, mr->readonly);
> +        if (ret) {
> +            /* Rollback */
> +            vfio_ram_discard_notify_discard(rdl, mr, offset, size);
> +            return ret;
> +        }
> +    }
> +    return 0;
> +}
> +
> +static void vfio_ram_discard_notify_discard_all(RamDiscardListener *rdl,
> +                                                const MemoryRegion *mr)
> +{
> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> +                                                listener);
> +    int ret;
> +
> +    /* Unmap with a single call. */
> +    ret = vfio_dma_unmap(vrdl->container, vrdl->offset_within_address_space,
> +                         vrdl->size, NULL);
> +    if (ret) {
> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
> +                     strerror(-ret));
> +    }
> +}
> +
> +static void vfio_register_ram_discard_notifier(VFIOContainer *container,
> +                                               MemoryRegionSection *section)
> +{
> +    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
> +    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
> +    VFIORamDiscardListener *vrdl;
> +
> +    vrdl = g_new0(VFIORamDiscardListener, 1);
> +    vrdl->container = container;
> +    vrdl->mr = section->mr;
> +    vrdl->offset_within_region = section->offset_within_region;
> +    vrdl->offset_within_address_space = section->offset_within_address_space;
> +    vrdl->size = int128_get64(section->size);
> +    vrdl->granularity = rdmc->get_min_granularity(rdm, section->mr);
> +    vrdl->dma_max = vrdl->size / vrdl->granularity;
> +    if (!QEMU_IS_ALIGNED(vrdl->size, vrdl->granularity) ||
> +        !QEMU_IS_ALIGNED(vrdl->offset_within_region, vrdl->granularity)) {
> +        vrdl->dma_max++;
> +    }
> +
> +    /* Ignore some corner cases not relevant in practice. */
> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_region, TARGET_PAGE_SIZE));
> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_address_space,
> +                             TARGET_PAGE_SIZE));
> +    g_assert(QEMU_IS_ALIGNED(vrdl->size, TARGET_PAGE_SIZE));
> +
> +    /* We could consume quite some mappings later. */
> +    vfio_container_dma_reserve(container, vrdl->dma_max);


Aha, I guess this is where the "reservation" aspect begins to appear.
Should this be its own counter though, perhaps
dma_discard_max_mappings?  The populate and discard callbacks could
further divide this into used and outstanding counters.  However, TBH
I'm not sure I understand the counters since this is probably the most
robust mapping path where we can actually safely nak a populate
callback.  Maybe rather than any of these runtime counters we should
just walk the vrdl_list, calculate max mappings, and if that exceeds
some large fraction of available mappings, issue a warning (not that
they wouldn't be useful for tracing).  Thanks,

Alex

> +
> +    ram_discard_listener_init(&vrdl->listener,
> +                              vfio_ram_discard_notify_populate,
> +                              vfio_ram_discard_notify_discard,
> +                              vfio_ram_discard_notify_discard_all);
> +    rdmc->register_listener(rdm, section->mr, &vrdl->listener);
> +    QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
> +}
> +
> +static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
> +                                                 MemoryRegionSection *section)
> +{
> +    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
> +    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
> +    VFIORamDiscardListener *vrdl = NULL;
> +
> +    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
> +        if (vrdl->mr == section->mr &&
> +            vrdl->offset_within_region == section->offset_within_region) {
> +            break;
> +        }
> +    }
> +
> +    if (!vrdl) {
> +        hw_error("vfio: Trying to unregister missing RAM discard listener");
> +    }
> +
> +    rdmc->unregister_listener(rdm, section->mr, &vrdl->listener);
> +    QLIST_REMOVE(vrdl, next);
> +
> +    vfio_container_dma_unreserve(container, vrdl->dma_max);
> +
> +    g_free(vrdl);
> +}
> +
>  static void vfio_listener_region_add(MemoryListener *listener,
>                                       MemoryRegionSection *section)
>  {
> @@ -834,6 +975,16 @@ static void vfio_listener_region_add(MemoryListener *listener,
>  
>      /* Here we assume that memory_region_is_ram(section->mr)==true */
>  
> +    /*
> +     * For RAM memory regions with a RamDiscardMgr, we only want to
> +     * register the actually "used" parts - and update the mapping whenever
> +     * we're notified about changes.
> +     */
> +    if (memory_region_has_ram_discard_mgr(section->mr)) {
> +        vfio_register_ram_discard_notifier(container, section);
> +        return;
> +    }
> +
>      vaddr = memory_region_get_ram_ptr(section->mr) +
>              section->offset_within_region +
>              (iova - section->offset_within_address_space);
> @@ -975,6 +1126,10 @@ static void vfio_listener_region_del(MemoryListener *listener,
>  
>          pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
>          try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
> +    } else if (memory_region_has_ram_discard_mgr(section->mr)) {
> +        vfio_unregister_ram_discard_listener(container, section);
> +        /* Unregistering will trigger an unmap. */
> +        try_unmap = false;
>      }
>  
>      if (try_unmap) {
> @@ -1107,6 +1262,59 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      rcu_read_unlock();
>  }
>  
> +static int vfio_ram_discard_notify_dirty_bitmap(RamDiscardListener *rdl,
> +                                                const MemoryRegion *mr,
> +                                                ram_addr_t offset,
> +                                                ram_addr_t size)
> +{
> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> +                                                listener);
> +    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
> +    const hwaddr mr_end = MIN(offset + size,
> +                              vrdl->offset_within_region + vrdl->size);
> +    const hwaddr iova = mr_start - vrdl->offset_within_region +
> +                        vrdl->offset_within_address_space;
> +    ram_addr_t ram_addr;
> +    int ret;
> +
> +    if (mr_start >= mr_end) {
> +        return 0;
> +    }
> +
> +    /*
> +     * Sync the whole mapped region (spanning multiple individual mappings)
> +     * in one go.
> +     */
> +    ram_addr = memory_region_get_ram_addr(vrdl->mr) + mr_start;
> +    ret = vfio_get_dirty_bitmap(vrdl->container, iova, mr_end - mr_start,
> +                                ram_addr);
> +    return ret;
> +}
> +
> +static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
> +                                                   MemoryRegionSection *section)
> +{
> +    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
> +    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
> +    VFIORamDiscardListener tmp_vrdl, *vrdl = NULL;
> +
> +    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
> +        if (vrdl->mr == section->mr &&
> +            vrdl->offset_within_region == section->offset_within_region) {
> +            break;
> +        }
> +    }
> +
> +    if (!vrdl) {
> +        hw_error("vfio: Trying to sync missing RAM discard listener");
> +    }
> +
> +    tmp_vrdl = *vrdl;
> +    ram_discard_listener_init(&tmp_vrdl.listener,
> +                              vfio_ram_discard_notify_dirty_bitmap, NULL, NULL);
> +    return rdmc->replay_populated(rdm, section->mr, &tmp_vrdl.listener);
> +}
> +
>  static int vfio_sync_dirty_bitmap(VFIOContainer *container,
>                                    MemoryRegionSection *section)
>  {
> @@ -1138,6 +1346,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container,
>              }
>          }
>          return 0;
> +    } else if (memory_region_has_ram_discard_mgr(section->mr)) {
> +        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
>      }
>  
>      ram_addr = memory_region_get_ram_addr(section->mr) +
> @@ -1768,6 +1978,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
>      container->dma_max = 0;
>      QLIST_INIT(&container->giommu_list);
>      QLIST_INIT(&container->hostwin_list);
> +    QLIST_INIT(&container->vrdl_list);
>  
>      ret = vfio_init_container(container, group->fd, errp);
>      if (ret) {
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index fed0e85f66..fba5a14c8b 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -93,6 +93,7 @@ typedef struct VFIOContainer {
>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>      QLIST_HEAD(, VFIOGroup) group_list;
> +    QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
>      QLIST_ENTRY(VFIOContainer) next;
>  } VFIOContainer;
>  
> @@ -104,6 +105,18 @@ typedef struct VFIOGuestIOMMU {
>      QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
>  } VFIOGuestIOMMU;
>  
> +typedef struct VFIORamDiscardListener {
> +    VFIOContainer *container;
> +    MemoryRegion *mr;
> +    hwaddr offset_within_region;
> +    hwaddr offset_within_address_space;
> +    hwaddr size;
> +    uint64_t granularity;
> +    unsigned long dma_max;
> +    RamDiscardListener listener;
> +    QLIST_ENTRY(VFIORamDiscardListener) next;
> +} VFIORamDiscardListener;
> +
>  typedef struct VFIOHostDMAWindow {
>      hwaddr min_iova;
>      hwaddr max_iova;
David Hildenbrand Dec. 17, 2020, 6:55 p.m. UTC | #2
On 17.12.20 19:36, Alex Williamson wrote:
> On Wed, 16 Dec 2020 15:11:55 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
>> Implement support for RamDiscardMgr, to prepare for virtio-mem
>> support. Instead of mapping the whole memory section, we only map
>> "populated" parts and update the mapping when notified about
>> discarding/population of memory via the RamDiscardListener. Similarly, when
>> syncing the dirty bitmaps, sync only the actually mapped (populated) parts
>> by replaying via the notifier.
>>
>> Small mapping granularity is problematic for vfio, because we might run out
>> of mappings. Indicate virito-mem as one of the problematic parts when
>> warning in vfio_container_dma_reserve() to at least make users aware that
>> there is such a limitation.
>>
>> Using virtio-mem with vfio is still blocked via
>> ram_block_discard_disable()/ram_block_discard_require() after this patch.
>>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>> Cc: Alex Williamson <alex.williamson@redhat.com>
>> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
>> Cc: Igor Mammedov <imammedo@redhat.com>
>> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
>> Cc: Peter Xu <peterx@redhat.com>
>> Cc: Auger Eric <eric.auger@redhat.com>
>> Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
>> Cc: teawater <teawaterz@linux.alibaba.com>
>> Cc: Marek Kedzierski <mkedzier@redhat.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>  hw/vfio/common.c              | 213 +++++++++++++++++++++++++++++++++-
>>  include/hw/vfio/vfio-common.h |  13 +++
>>  2 files changed, 225 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 5ad88d476f..b1582be1e8 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -296,7 +296,8 @@ static void vfio_container_dma_reserve(VFIOContainer *container,
>>      container->dma_reserved += dma_mappings;
>>      if (!warned && container->dma_max &&
>>          container->dma_reserved > container->dma_max) {
>> -        warn_report("%s: possibly running out of DMA mappings. "
>> +        warn_report("%s: possibly running out of DMA mappings. E.g., try"
>> +                    " increasing the 'block-size' of virtio-mem devies."
>>                      " Maximum number of DMA mappings: %d", __func__,
>>                      container->dma_max);
>>      }
>> @@ -674,6 +675,146 @@ out:
>>      rcu_read_unlock();
>>  }
>>  
>> +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
>> +                                            const MemoryRegion *mr,
>> +                                            ram_addr_t offset, ram_addr_t size)
>> +{
>> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
>> +                                                listener);
>> +    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
>> +    const hwaddr mr_end = MIN(offset + size,
>> +                              vrdl->offset_within_region + vrdl->size);
>> +    const hwaddr iova = mr_start - vrdl->offset_within_region +
>> +                        vrdl->offset_within_address_space;
>> +    int ret;
>> +
>> +    if (mr_start >= mr_end) {
>> +        return;
>> +    }
>> +
>> +    /* Unmap with a single call. */
>> +    ret = vfio_dma_unmap(vrdl->container, iova, mr_end - mr_start, NULL);
>> +    if (ret) {
>> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
>> +                     strerror(-ret));
>> +    }
>> +}
>> +
>> +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>> +                                            const MemoryRegion *mr,
>> +                                            ram_addr_t offset, ram_addr_t size)
>> +{
>> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
>> +                                                listener);
>> +    const hwaddr mr_end = MIN(offset + size,
>> +                              vrdl->offset_within_region + vrdl->size);
>> +    hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
>> +    hwaddr mr_next, iova;
>> +    void *vaddr;
>> +    int ret;
>> +
>> +    /*
>> +     * Map in (aligned within memory region) minimum granularity, so we can
>> +     * unmap in minimum granularity later.
>> +     */
>> +    for (; mr_start < mr_end; mr_start = mr_next) {
>> +        mr_next = QEMU_ALIGN_UP(mr_start + 1, vrdl->granularity);
>> +        mr_next = MIN(mr_next, mr_end);
>> +
>> +        iova = mr_start - vrdl->offset_within_region +
>> +               vrdl->offset_within_address_space;
>> +        vaddr = memory_region_get_ram_ptr(vrdl->mr) + mr_start;
>> +
>> +        ret = vfio_dma_map(vrdl->container, iova, mr_next - mr_start,
>> +                           vaddr, mr->readonly);
>> +        if (ret) {
>> +            /* Rollback */
>> +            vfio_ram_discard_notify_discard(rdl, mr, offset, size);
>> +            return ret;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +static void vfio_ram_discard_notify_discard_all(RamDiscardListener *rdl,
>> +                                                const MemoryRegion *mr)
>> +{
>> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
>> +                                                listener);
>> +    int ret;
>> +
>> +    /* Unmap with a single call. */
>> +    ret = vfio_dma_unmap(vrdl->container, vrdl->offset_within_address_space,
>> +                         vrdl->size, NULL);
>> +    if (ret) {
>> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
>> +                     strerror(-ret));
>> +    }
>> +}
>> +
>> +static void vfio_register_ram_discard_notifier(VFIOContainer *container,
>> +                                               MemoryRegionSection *section)
>> +{
>> +    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
>> +    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
>> +    VFIORamDiscardListener *vrdl;
>> +
>> +    vrdl = g_new0(VFIORamDiscardListener, 1);
>> +    vrdl->container = container;
>> +    vrdl->mr = section->mr;
>> +    vrdl->offset_within_region = section->offset_within_region;
>> +    vrdl->offset_within_address_space = section->offset_within_address_space;
>> +    vrdl->size = int128_get64(section->size);
>> +    vrdl->granularity = rdmc->get_min_granularity(rdm, section->mr);
>> +    vrdl->dma_max = vrdl->size / vrdl->granularity;
>> +    if (!QEMU_IS_ALIGNED(vrdl->size, vrdl->granularity) ||
>> +        !QEMU_IS_ALIGNED(vrdl->offset_within_region, vrdl->granularity)) {
>> +        vrdl->dma_max++;
>> +    }
>> +
>> +    /* Ignore some corner cases not relevant in practice. */
>> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_region, TARGET_PAGE_SIZE));
>> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_address_space,
>> +                             TARGET_PAGE_SIZE));
>> +    g_assert(QEMU_IS_ALIGNED(vrdl->size, TARGET_PAGE_SIZE));
>> +
>> +    /* We could consume quite some mappings later. */
>> +    vfio_container_dma_reserve(container, vrdl->dma_max);
> 
> 
> Aha, I guess this is where the "reservation" aspect begins to appear.
> Should this be its own counter though, perhaps
> dma_discard_max_mappings?  The populate and discard callbacks could
> further divide this into used and outstanding counters.  However, TBH
> I'm not sure I understand the counters since this is probably the most
> robust mapping path where we can actually safely nak a populate

I'd like to be able to warn early on fundamental setup issues, not only
when accidentally running into these limits later.

> callback.  Maybe rather than any of these runtime counters we should
> just walk the vrdl_list, calculate max mappings, and if that exceeds
> some large fraction of available mappings, issue a warning (not that
> they wouldn't be useful for tracing).  Thanks,

Sure, we can calculate max mappings from the vrdl_list. But which
fraction to chose? The reservation approach simply considers any
mappings (well, except IOMMU because they are kind of special)

Guidance on the fraction / #mappings to assume we can use appreciated.

Thanks!

> 
> Alex
Alex Williamson Dec. 17, 2020, 7:59 p.m. UTC | #3
On Thu, 17 Dec 2020 19:55:55 +0100
David Hildenbrand <david@redhat.com> wrote:

> On 17.12.20 19:36, Alex Williamson wrote:
> > On Wed, 16 Dec 2020 15:11:55 +0100
> > David Hildenbrand <david@redhat.com> wrote:
> >   
> >> Implement support for RamDiscardMgr, to prepare for virtio-mem
> >> support. Instead of mapping the whole memory section, we only map
> >> "populated" parts and update the mapping when notified about
> >> discarding/population of memory via the RamDiscardListener. Similarly, when
> >> syncing the dirty bitmaps, sync only the actually mapped (populated) parts
> >> by replaying via the notifier.
> >>
> >> Small mapping granularity is problematic for vfio, because we might run out
> >> of mappings. Indicate virito-mem as one of the problematic parts when
> >> warning in vfio_container_dma_reserve() to at least make users aware that
> >> there is such a limitation.
> >>
> >> Using virtio-mem with vfio is still blocked via
> >> ram_block_discard_disable()/ram_block_discard_require() after this patch.
> >>
> >> Cc: Paolo Bonzini <pbonzini@redhat.com>
> >> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> >> Cc: Alex Williamson <alex.williamson@redhat.com>
> >> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >> Cc: Igor Mammedov <imammedo@redhat.com>
> >> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
> >> Cc: Peter Xu <peterx@redhat.com>
> >> Cc: Auger Eric <eric.auger@redhat.com>
> >> Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
> >> Cc: teawater <teawaterz@linux.alibaba.com>
> >> Cc: Marek Kedzierski <mkedzier@redhat.com>
> >> Signed-off-by: David Hildenbrand <david@redhat.com>
> >> ---
> >>  hw/vfio/common.c              | 213 +++++++++++++++++++++++++++++++++-
> >>  include/hw/vfio/vfio-common.h |  13 +++
> >>  2 files changed, 225 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >> index 5ad88d476f..b1582be1e8 100644
> >> --- a/hw/vfio/common.c
> >> +++ b/hw/vfio/common.c
> >> @@ -296,7 +296,8 @@ static void vfio_container_dma_reserve(VFIOContainer *container,
> >>      container->dma_reserved += dma_mappings;
> >>      if (!warned && container->dma_max &&
> >>          container->dma_reserved > container->dma_max) {
> >> -        warn_report("%s: possibly running out of DMA mappings. "
> >> +        warn_report("%s: possibly running out of DMA mappings. E.g., try"
> >> +                    " increasing the 'block-size' of virtio-mem devies."
> >>                      " Maximum number of DMA mappings: %d", __func__,
> >>                      container->dma_max);
> >>      }
> >> @@ -674,6 +675,146 @@ out:
> >>      rcu_read_unlock();
> >>  }
> >>  
> >> +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
> >> +                                            const MemoryRegion *mr,
> >> +                                            ram_addr_t offset, ram_addr_t size)
> >> +{
> >> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> >> +                                                listener);
> >> +    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
> >> +    const hwaddr mr_end = MIN(offset + size,
> >> +                              vrdl->offset_within_region + vrdl->size);
> >> +    const hwaddr iova = mr_start - vrdl->offset_within_region +
> >> +                        vrdl->offset_within_address_space;
> >> +    int ret;
> >> +
> >> +    if (mr_start >= mr_end) {
> >> +        return;
> >> +    }
> >> +
> >> +    /* Unmap with a single call. */
> >> +    ret = vfio_dma_unmap(vrdl->container, iova, mr_end - mr_start, NULL);
> >> +    if (ret) {
> >> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
> >> +                     strerror(-ret));
> >> +    }
> >> +}
> >> +
> >> +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
> >> +                                            const MemoryRegion *mr,
> >> +                                            ram_addr_t offset, ram_addr_t size)
> >> +{
> >> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> >> +                                                listener);
> >> +    const hwaddr mr_end = MIN(offset + size,
> >> +                              vrdl->offset_within_region + vrdl->size);
> >> +    hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
> >> +    hwaddr mr_next, iova;
> >> +    void *vaddr;
> >> +    int ret;
> >> +
> >> +    /*
> >> +     * Map in (aligned within memory region) minimum granularity, so we can
> >> +     * unmap in minimum granularity later.
> >> +     */
> >> +    for (; mr_start < mr_end; mr_start = mr_next) {
> >> +        mr_next = QEMU_ALIGN_UP(mr_start + 1, vrdl->granularity);
> >> +        mr_next = MIN(mr_next, mr_end);
> >> +
> >> +        iova = mr_start - vrdl->offset_within_region +
> >> +               vrdl->offset_within_address_space;
> >> +        vaddr = memory_region_get_ram_ptr(vrdl->mr) + mr_start;
> >> +
> >> +        ret = vfio_dma_map(vrdl->container, iova, mr_next - mr_start,
> >> +                           vaddr, mr->readonly);
> >> +        if (ret) {
> >> +            /* Rollback */
> >> +            vfio_ram_discard_notify_discard(rdl, mr, offset, size);
> >> +            return ret;
> >> +        }
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >> +static void vfio_ram_discard_notify_discard_all(RamDiscardListener *rdl,
> >> +                                                const MemoryRegion *mr)
> >> +{
> >> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
> >> +                                                listener);
> >> +    int ret;
> >> +
> >> +    /* Unmap with a single call. */
> >> +    ret = vfio_dma_unmap(vrdl->container, vrdl->offset_within_address_space,
> >> +                         vrdl->size, NULL);
> >> +    if (ret) {
> >> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
> >> +                     strerror(-ret));
> >> +    }
> >> +}
> >> +
> >> +static void vfio_register_ram_discard_notifier(VFIOContainer *container,
> >> +                                               MemoryRegionSection *section)
> >> +{
> >> +    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
> >> +    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
> >> +    VFIORamDiscardListener *vrdl;
> >> +
> >> +    vrdl = g_new0(VFIORamDiscardListener, 1);
> >> +    vrdl->container = container;
> >> +    vrdl->mr = section->mr;
> >> +    vrdl->offset_within_region = section->offset_within_region;
> >> +    vrdl->offset_within_address_space = section->offset_within_address_space;
> >> +    vrdl->size = int128_get64(section->size);
> >> +    vrdl->granularity = rdmc->get_min_granularity(rdm, section->mr);
> >> +    vrdl->dma_max = vrdl->size / vrdl->granularity;
> >> +    if (!QEMU_IS_ALIGNED(vrdl->size, vrdl->granularity) ||
> >> +        !QEMU_IS_ALIGNED(vrdl->offset_within_region, vrdl->granularity)) {
> >> +        vrdl->dma_max++;
> >> +    }
> >> +
> >> +    /* Ignore some corner cases not relevant in practice. */
> >> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_region, TARGET_PAGE_SIZE));
> >> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_address_space,
> >> +                             TARGET_PAGE_SIZE));
> >> +    g_assert(QEMU_IS_ALIGNED(vrdl->size, TARGET_PAGE_SIZE));
> >> +
> >> +    /* We could consume quite some mappings later. */
> >> +    vfio_container_dma_reserve(container, vrdl->dma_max);  
> > 
> > 
> > Aha, I guess this is where the "reservation" aspect begins to appear.
> > Should this be its own counter though, perhaps
> > dma_discard_max_mappings?  The populate and discard callbacks could
> > further divide this into used and outstanding counters.  However, TBH
> > I'm not sure I understand the counters since this is probably the most
> > robust mapping path where we can actually safely nak a populate  
> 
> I'd like to be able to warn early on fundamental setup issues, not only
> when accidentally running into these limits later.
> 
> > callback.  Maybe rather than any of these runtime counters we should
> > just walk the vrdl_list, calculate max mappings, and if that exceeds
> > some large fraction of available mappings, issue a warning (not that
> > they wouldn't be useful for tracing).  Thanks,  
> 
> Sure, we can calculate max mappings from the vrdl_list. But which
> fraction to chose? The reservation approach simply considers any
> mappings (well, except IOMMU because they are kind of special)

Right, but we're looking at the address space of a device, which should
be exclusively system memory or an IOMMU range, right?  There are IOMMUs
that don't restrict the device to the IOVA window, but I'm not sure if
we care about those.  If that's true, I'm not sure we need to worry
about the complicated intersection of RamDiscardMgr and vIOMMU both
creating mappings.

> Guidance on the fraction / #mappings to assume we can use appreciated.

Can we use the number of KVM memory slots as a guide?  This is
essentially a mechanism for sub-dividing things that exist in a KVM
memory slot, so it seems like (dma_avail - KVM-memory-slots) should be
greater than the # of possible granules we'd map across all the
RamDiscardMgr regions.  Maybe a good starting point?  Thanks,

Alex
David Hildenbrand Dec. 18, 2020, 9:11 a.m. UTC | #4
On 17.12.20 20:59, Alex Williamson wrote:
> On Thu, 17 Dec 2020 19:55:55 +0100
> David Hildenbrand <david@redhat.com> wrote:
> 
>> On 17.12.20 19:36, Alex Williamson wrote:
>>> On Wed, 16 Dec 2020 15:11:55 +0100
>>> David Hildenbrand <david@redhat.com> wrote:
>>>   
>>>> Implement support for RamDiscardMgr, to prepare for virtio-mem
>>>> support. Instead of mapping the whole memory section, we only map
>>>> "populated" parts and update the mapping when notified about
>>>> discarding/population of memory via the RamDiscardListener. Similarly, when
>>>> syncing the dirty bitmaps, sync only the actually mapped (populated) parts
>>>> by replaying via the notifier.
>>>>
>>>> Small mapping granularity is problematic for vfio, because we might run out
>>>> of mappings. Indicate virtio-mem as one of the problematic parts when
>>>> warning in vfio_container_dma_reserve() to at least make users aware that
>>>> there is such a limitation.
>>>>
>>>> Using virtio-mem with vfio is still blocked via
>>>> ram_block_discard_disable()/ram_block_discard_require() after this patch.
>>>>
>>>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>> Cc: Alex Williamson <alex.williamson@redhat.com>
>>>> Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>> Cc: Igor Mammedov <imammedo@redhat.com>
>>>> Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
>>>> Cc: Peter Xu <peterx@redhat.com>
>>>> Cc: Auger Eric <eric.auger@redhat.com>
>>>> Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
>>>> Cc: teawater <teawaterz@linux.alibaba.com>
>>>> Cc: Marek Kedzierski <mkedzier@redhat.com>
>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>> ---
>>>>  hw/vfio/common.c              | 213 +++++++++++++++++++++++++++++++++-
>>>>  include/hw/vfio/vfio-common.h |  13 +++
>>>>  2 files changed, 225 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>>>> index 5ad88d476f..b1582be1e8 100644
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -296,7 +296,8 @@ static void vfio_container_dma_reserve(VFIOContainer *container,
>>>>      container->dma_reserved += dma_mappings;
>>>>      if (!warned && container->dma_max &&
>>>>          container->dma_reserved > container->dma_max) {
>>>> -        warn_report("%s: possibly running out of DMA mappings. "
>>>> +        warn_report("%s: possibly running out of DMA mappings. E.g., try"
>>>> +                    " increasing the 'block-size' of virtio-mem devices."
>>>>                      " Maximum number of DMA mappings: %d", __func__,
>>>>                      container->dma_max);
>>>>      }
>>>> @@ -674,6 +675,146 @@ out:
>>>>      rcu_read_unlock();
>>>>  }
>>>>  
>>>> +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
>>>> +                                            const MemoryRegion *mr,
>>>> +                                            ram_addr_t offset, ram_addr_t size)
>>>> +{
>>>> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
>>>> +                                                listener);
>>>> +    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
>>>> +    const hwaddr mr_end = MIN(offset + size,
>>>> +                              vrdl->offset_within_region + vrdl->size);
>>>> +    const hwaddr iova = mr_start - vrdl->offset_within_region +
>>>> +                        vrdl->offset_within_address_space;
>>>> +    int ret;
>>>> +
>>>> +    if (mr_start >= mr_end) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    /* Unmap with a single call. */
>>>> +    ret = vfio_dma_unmap(vrdl->container, iova, mr_end - mr_start, NULL);
>>>> +    if (ret) {
>>>> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
>>>> +                     strerror(-ret));
>>>> +    }
>>>> +}
>>>> +
>>>> +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>>>> +                                            const MemoryRegion *mr,
>>>> +                                            ram_addr_t offset, ram_addr_t size)
>>>> +{
>>>> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
>>>> +                                                listener);
>>>> +    const hwaddr mr_end = MIN(offset + size,
>>>> +                              vrdl->offset_within_region + vrdl->size);
>>>> +    hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
>>>> +    hwaddr mr_next, iova;
>>>> +    void *vaddr;
>>>> +    int ret;
>>>> +
>>>> +    /*
>>>> +     * Map in (aligned within memory region) minimum granularity, so we can
>>>> +     * unmap in minimum granularity later.
>>>> +     */
>>>> +    for (; mr_start < mr_end; mr_start = mr_next) {
>>>> +        mr_next = QEMU_ALIGN_UP(mr_start + 1, vrdl->granularity);
>>>> +        mr_next = MIN(mr_next, mr_end);
>>>> +
>>>> +        iova = mr_start - vrdl->offset_within_region +
>>>> +               vrdl->offset_within_address_space;
>>>> +        vaddr = memory_region_get_ram_ptr(vrdl->mr) + mr_start;
>>>> +
>>>> +        ret = vfio_dma_map(vrdl->container, iova, mr_next - mr_start,
>>>> +                           vaddr, mr->readonly);
>>>> +        if (ret) {
>>>> +            /* Rollback */
>>>> +            vfio_ram_discard_notify_discard(rdl, mr, offset, size);
>>>> +            return ret;
>>>> +        }
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void vfio_ram_discard_notify_discard_all(RamDiscardListener *rdl,
>>>> +                                                const MemoryRegion *mr)
>>>> +{
>>>> +    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
>>>> +                                                listener);
>>>> +    int ret;
>>>> +
>>>> +    /* Unmap with a single call. */
>>>> +    ret = vfio_dma_unmap(vrdl->container, vrdl->offset_within_address_space,
>>>> +                         vrdl->size, NULL);
>>>> +    if (ret) {
>>>> +        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
>>>> +                     strerror(-ret));
>>>> +    }
>>>> +}
>>>> +
>>>> +static void vfio_register_ram_discard_notifier(VFIOContainer *container,
>>>> +                                               MemoryRegionSection *section)
>>>> +{
>>>> +    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
>>>> +    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
>>>> +    VFIORamDiscardListener *vrdl;
>>>> +
>>>> +    vrdl = g_new0(VFIORamDiscardListener, 1);
>>>> +    vrdl->container = container;
>>>> +    vrdl->mr = section->mr;
>>>> +    vrdl->offset_within_region = section->offset_within_region;
>>>> +    vrdl->offset_within_address_space = section->offset_within_address_space;
>>>> +    vrdl->size = int128_get64(section->size);
>>>> +    vrdl->granularity = rdmc->get_min_granularity(rdm, section->mr);
>>>> +    vrdl->dma_max = vrdl->size / vrdl->granularity;
>>>> +    if (!QEMU_IS_ALIGNED(vrdl->size, vrdl->granularity) ||
>>>> +        !QEMU_IS_ALIGNED(vrdl->offset_within_region, vrdl->granularity)) {
>>>> +        vrdl->dma_max++;
>>>> +    }
>>>> +
>>>> +    /* Ignore some corner cases not relevant in practice. */
>>>> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_region, TARGET_PAGE_SIZE));
>>>> +    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_address_space,
>>>> +                             TARGET_PAGE_SIZE));
>>>> +    g_assert(QEMU_IS_ALIGNED(vrdl->size, TARGET_PAGE_SIZE));
>>>> +
>>>> +    /* We could consume quite some mappings later. */
>>>> +    vfio_container_dma_reserve(container, vrdl->dma_max);  
>>>
>>>
>>> Aha, I guess this is where the "reservation" aspect begins to appear.
>>> Should this be its own counter though, perhaps
>>> dma_discard_max_mappings?  The populate and discard callbacks could
>>> further divide this into used and outstanding counters.  However, TBH
>>> I'm not sure I understand the counters since this is probably the most
>>> robust mapping path where we can actually safely nak a populate  
>>
>> I'd like to be able to warn early on fundamental setup issues, not only
>> when accidentally running into these limits later.
>>
>>> callback.  Maybe rather than any of these runtime counters we should
>>> just walk the vrdl_list, calculate max mappings, and if that exceeds
>>> some large fraction of available mappings, issue a warning (not that
>>> they wouldn't be useful for tracing).  Thanks,  
>>
>> Sure, we can calculate max mappings from the vrdl_list. But which
>> fraction to chose? The reservation approach simply considers any
>> mappings (well, except IOMMU because they are kind of special)
> 
> Right, but we're looking at the address space of a device, which should
> be exclusively system memory or an IOMMU range, right?  There are IOMMUs

Yes, that's why I ignored the IOMMU case for now - RamDiscardMgr and
vIOMMUs should be mutually exclusive in an address space.

> that don't restrict the device to the IOVA window, but I'm not sure if
> we care about those.  If that's true, I'm not sure we need to worry
> about the complicated intersection of RamDiscardMgr and vIOMMU both
> creating mappings.

Yes.

> 
>> Guidance on the fraction / #mappings to assume we can use appreciated.
> 
> Can we use the number of KVM memory slots as a guide?  This is
> essentially a mechanism for sub-dividing things that exist in a KVM
> memory slot, so it seems like (dma_avail - KVM-memory-slots) should be
> greater than the # of possible granules we'd map across all the
> RamDiscardMgr regions.  Maybe a good starting point?

Good idea. I started with "maximum #DIMMs", but then thought "what about
mapping other things, like PCI bars into address space".
#max-KVM-memory-slots should be a good starting point for now.

So, when mapping a new RamDiscardMgr I'll
1. Loop over vrdl_list and calculate the max #mappings
2. Check against "dma_max - #max-KVM-memory-slots" and issue a
RamDiscardMgr specific error message
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 5ad88d476f..b1582be1e8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -296,7 +296,8 @@  static void vfio_container_dma_reserve(VFIOContainer *container,
     container->dma_reserved += dma_mappings;
     if (!warned && container->dma_max &&
         container->dma_reserved > container->dma_max) {
-        warn_report("%s: possibly running out of DMA mappings. "
+        warn_report("%s: possibly running out of DMA mappings. E.g., try"
+                    " increasing the 'block-size' of virtio-mem devices."
                     " Maximum number of DMA mappings: %d", __func__,
                     container->dma_max);
     }
@@ -674,6 +675,146 @@  out:
     rcu_read_unlock();
 }
 
+static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
+                                            const MemoryRegion *mr,
+                                            ram_addr_t offset, ram_addr_t size)
+{
+    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
+                                                listener);
+    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
+    const hwaddr mr_end = MIN(offset + size,
+                              vrdl->offset_within_region + vrdl->size);
+    const hwaddr iova = mr_start - vrdl->offset_within_region +
+                        vrdl->offset_within_address_space;
+    int ret;
+
+    if (mr_start >= mr_end) {
+        return;
+    }
+
+    /* Unmap with a single call. */
+    ret = vfio_dma_unmap(vrdl->container, iova, mr_end - mr_start, NULL);
+    if (ret) {
+        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
+                     strerror(-ret));
+    }
+}
+
+static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
+                                            const MemoryRegion *mr,
+                                            ram_addr_t offset, ram_addr_t size)
+{
+    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
+                                                listener);
+    const hwaddr mr_end = MIN(offset + size,
+                              vrdl->offset_within_region + vrdl->size);
+    hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
+    hwaddr mr_next, iova;
+    void *vaddr;
+    int ret;
+
+    /*
+     * Map in (aligned within memory region) minimum granularity, so we can
+     * unmap in minimum granularity later.
+     */
+    for (; mr_start < mr_end; mr_start = mr_next) {
+        mr_next = QEMU_ALIGN_UP(mr_start + 1, vrdl->granularity);
+        mr_next = MIN(mr_next, mr_end);
+
+        iova = mr_start - vrdl->offset_within_region +
+               vrdl->offset_within_address_space;
+        vaddr = memory_region_get_ram_ptr(vrdl->mr) + mr_start;
+
+        ret = vfio_dma_map(vrdl->container, iova, mr_next - mr_start,
+                           vaddr, mr->readonly);
+        if (ret) {
+            /* Rollback */
+            vfio_ram_discard_notify_discard(rdl, mr, offset, size);
+            return ret;
+        }
+    }
+    return 0;
+}
+
+static void vfio_ram_discard_notify_discard_all(RamDiscardListener *rdl,
+                                                const MemoryRegion *mr)
+{
+    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
+                                                listener);
+    int ret;
+
+    /* Unmap with a single call. */
+    ret = vfio_dma_unmap(vrdl->container, vrdl->offset_within_address_space,
+                         vrdl->size, NULL);
+    if (ret) {
+        error_report("%s: vfio_dma_unmap() failed: %s", __func__,
+                     strerror(-ret));
+    }
+}
+
+static void vfio_register_ram_discard_notifier(VFIOContainer *container,
+                                               MemoryRegionSection *section)
+{
+    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
+    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
+    VFIORamDiscardListener *vrdl;
+
+    vrdl = g_new0(VFIORamDiscardListener, 1);
+    vrdl->container = container;
+    vrdl->mr = section->mr;
+    vrdl->offset_within_region = section->offset_within_region;
+    vrdl->offset_within_address_space = section->offset_within_address_space;
+    vrdl->size = int128_get64(section->size);
+    vrdl->granularity = rdmc->get_min_granularity(rdm, section->mr);
+    vrdl->dma_max = vrdl->size / vrdl->granularity;
+    if (!QEMU_IS_ALIGNED(vrdl->size, vrdl->granularity) ||
+        !QEMU_IS_ALIGNED(vrdl->offset_within_region, vrdl->granularity)) {
+        vrdl->dma_max++;
+    }
+
+    /* Ignore some corner cases not relevant in practice. */
+    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_region, TARGET_PAGE_SIZE));
+    g_assert(QEMU_IS_ALIGNED(vrdl->offset_within_address_space,
+                             TARGET_PAGE_SIZE));
+    g_assert(QEMU_IS_ALIGNED(vrdl->size, TARGET_PAGE_SIZE));
+
+    /* We could consume quite some mappings later. */
+    vfio_container_dma_reserve(container, vrdl->dma_max);
+
+    ram_discard_listener_init(&vrdl->listener,
+                              vfio_ram_discard_notify_populate,
+                              vfio_ram_discard_notify_discard,
+                              vfio_ram_discard_notify_discard_all);
+    rdmc->register_listener(rdm, section->mr, &vrdl->listener);
+    QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
+}
+
+static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
+                                                 MemoryRegionSection *section)
+{
+    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
+    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
+    VFIORamDiscardListener *vrdl = NULL;
+
+    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+        if (vrdl->mr == section->mr &&
+            vrdl->offset_within_region == section->offset_within_region) {
+            break;
+        }
+    }
+
+    if (!vrdl) {
+        hw_error("vfio: Trying to unregister missing RAM discard listener");
+    }
+
+    rdmc->unregister_listener(rdm, section->mr, &vrdl->listener);
+    QLIST_REMOVE(vrdl, next);
+
+    vfio_container_dma_unreserve(container, vrdl->dma_max);
+
+    g_free(vrdl);
+}
+
 static void vfio_listener_region_add(MemoryListener *listener,
                                      MemoryRegionSection *section)
 {
@@ -834,6 +975,16 @@  static void vfio_listener_region_add(MemoryListener *listener,
 
     /* Here we assume that memory_region_is_ram(section->mr)==true */
 
+    /*
+     * For RAM memory regions with a RamDiscardMgr, we only want to
+     * register the actually "used" parts - and update the mapping whenever
+     * we're notified about changes.
+     */
+    if (memory_region_has_ram_discard_mgr(section->mr)) {
+        vfio_register_ram_discard_notifier(container, section);
+        return;
+    }
+
     vaddr = memory_region_get_ram_ptr(section->mr) +
             section->offset_within_region +
             (iova - section->offset_within_address_space);
@@ -975,6 +1126,10 @@  static void vfio_listener_region_del(MemoryListener *listener,
 
         pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
         try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
+    } else if (memory_region_has_ram_discard_mgr(section->mr)) {
+        vfio_unregister_ram_discard_listener(container, section);
+        /* Unregistering will trigger an unmap. */
+        try_unmap = false;
     }
 
     if (try_unmap) {
@@ -1107,6 +1262,59 @@  static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
     rcu_read_unlock();
 }
 
+static int vfio_ram_discard_notify_dirty_bitmap(RamDiscardListener *rdl,
+                                                const MemoryRegion *mr,
+                                                ram_addr_t offset,
+                                                ram_addr_t size)
+{
+    VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
+                                                listener);
+    const hwaddr mr_start = MAX(offset, vrdl->offset_within_region);
+    const hwaddr mr_end = MIN(offset + size,
+                              vrdl->offset_within_region + vrdl->size);
+    const hwaddr iova = mr_start - vrdl->offset_within_region +
+                        vrdl->offset_within_address_space;
+    ram_addr_t ram_addr;
+    int ret;
+
+    if (mr_start >= mr_end) {
+        return 0;
+    }
+
+    /*
+     * Sync the whole mapped region (spanning multiple individual mappings)
+     * in one go.
+     */
+    ram_addr = memory_region_get_ram_addr(vrdl->mr) + mr_start;
+    ret = vfio_get_dirty_bitmap(vrdl->container, iova, mr_end - mr_start,
+                                ram_addr);
+    return ret;
+}
+
+static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
+                                                   MemoryRegionSection *section)
+{
+    RamDiscardMgr *rdm = memory_region_get_ram_discard_mgr(section->mr);
+    RamDiscardMgrClass *rdmc = RAM_DISCARD_MGR_GET_CLASS(rdm);
+    VFIORamDiscardListener tmp_vrdl, *vrdl = NULL;
+
+    QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
+        if (vrdl->mr == section->mr &&
+            vrdl->offset_within_region == section->offset_within_region) {
+            break;
+        }
+    }
+
+    if (!vrdl) {
+        hw_error("vfio: Trying to sync missing RAM discard listener");
+    }
+
+    tmp_vrdl = *vrdl;
+    ram_discard_listener_init(&tmp_vrdl.listener,
+                              vfio_ram_discard_notify_dirty_bitmap, NULL, NULL);
+    return rdmc->replay_populated(rdm, section->mr, &tmp_vrdl.listener);
+}
+
 static int vfio_sync_dirty_bitmap(VFIOContainer *container,
                                   MemoryRegionSection *section)
 {
@@ -1138,6 +1346,8 @@  static int vfio_sync_dirty_bitmap(VFIOContainer *container,
             }
         }
         return 0;
+    } else if (memory_region_has_ram_discard_mgr(section->mr)) {
+        return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
     }
 
     ram_addr = memory_region_get_ram_addr(section->mr) +
@@ -1768,6 +1978,7 @@  static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
     container->dma_max = 0;
     QLIST_INIT(&container->giommu_list);
     QLIST_INIT(&container->hostwin_list);
+    QLIST_INIT(&container->vrdl_list);
 
     ret = vfio_init_container(container, group->fd, errp);
     if (ret) {
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index fed0e85f66..fba5a14c8b 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -93,6 +93,7 @@  typedef struct VFIOContainer {
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;
+    QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
     QLIST_ENTRY(VFIOContainer) next;
 } VFIOContainer;
 
@@ -104,6 +105,18 @@  typedef struct VFIOGuestIOMMU {
     QLIST_ENTRY(VFIOGuestIOMMU) giommu_next;
 } VFIOGuestIOMMU;
 
+typedef struct VFIORamDiscardListener {
+    VFIOContainer *container;
+    MemoryRegion *mr;
+    hwaddr offset_within_region;
+    hwaddr offset_within_address_space;
+    hwaddr size;
+    uint64_t granularity;
+    unsigned long dma_max;
+    RamDiscardListener listener;
+    QLIST_ENTRY(VFIORamDiscardListener) next;
+} VFIORamDiscardListener;
+
 typedef struct VFIOHostDMAWindow {
     hwaddr min_iova;
     hwaddr max_iova;