diff mbox series

[v3,07/13] vfio/common: Record DMA mapped IOVA ranges

Message ID 20230304014343.33646-8-joao.m.martins@oracle.com
State New
Headers show
Series vfio/migration: Device dirty page tracking | expand

Commit Message

Joao Martins March 4, 2023, 1:43 a.m. UTC
According to the device DMA logging uAPI, IOVA ranges to be logged by
the device must be provided all at once upon DMA logging start.

As preparation for the following patches which will add device dirty
page tracking, keep a record of all DMA mapped IOVA ranges so later they
can be used for DMA logging start.

Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
---
 hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
 hw/vfio/trace-events          |  1 +
 include/hw/vfio/vfio-common.h | 11 +++++
 3 files changed, 96 insertions(+)

Comments

Cédric Le Goater March 6, 2023, 1:41 p.m. UTC | #1
On 3/4/23 02:43, Joao Martins wrote:
> According to the device DMA logging uAPI, IOVA ranges to be logged by
> the device must be provided all at once upon DMA logging start.
> 
> As preparation for the following patches which will add device dirty
> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> can be used for DMA logging start.
> 
> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
> 
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>   hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
>   hw/vfio/trace-events          |  1 +
>   include/hw/vfio/vfio-common.h | 11 +++++
>   3 files changed, 96 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ed908e303dbb..d84e5fd86bb4 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -44,6 +44,7 @@
>   #include "migration/blocker.h"
>   #include "migration/qemu-file.h"
>   #include "sysemu/tpm.h"
> +#include "qemu/iova-tree.h"
>   
>   VFIOGroupList vfio_group_list =
>       QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>       return ret;
>   }
>   
> +/*
> + * Called for the dirty tracking memory listener to calculate the iova/end
> + * for a given memory region section. The checks here, replicate the logic
> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
> + * both listener should be kept in sync.
> + */
> +static bool vfio_get_section_iova_range(VFIOContainer *container,
> +                                        MemoryRegionSection *section,
> +                                        hwaddr *out_iova, hwaddr *out_end)
> +{
> +    Int128 llend;
> +    hwaddr iova;
> +
> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
> +    llend = int128_make64(section->offset_within_address_space);
> +    llend = int128_add(llend, section->size);
> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
> +
> +    if (int128_ge(int128_make64(iova), llend)) {
> +        return false;
> +    }
> +
> +    *out_iova = iova;
> +    *out_end = int128_get64(llend) - 1;
> +    return true;
> +}
> +
> +static void vfio_dirty_tracking_update(MemoryListener *listener,
> +                                       MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            tracking_listener);
> +    VFIODirtyTrackingRange *range = &container->tracking_range;
> +    hwaddr max32 = (1ULL << 32) - 1ULL;
> +    hwaddr iova, end;
> +
> +    if (!vfio_listener_valid_section(section) ||
> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
> +        return;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
> +        if (iova < max32 && end <= max32) {
> +                if (range->min32 > iova) {
> +                    range->min32 = iova;
> +                }
> +                if (range->max32 < end) {
> +                    range->max32 = end;
> +                }
> +                trace_vfio_device_dirty_tracking_update(iova, end,
> +                                            range->min32, range->max32);
> +        } else {
> +                if (!range->min64 || range->min64 > iova) {
> +                    range->min64 = iova;
> +                }
> +                if (range->max64 < end) {
> +                    range->max64 = end;
> +                }
> +                trace_vfio_device_dirty_tracking_update(iova, end,
> +                                            range->min64, range->max64);
> +        }
> +    }
> +    return;
> +}
> +
> +static const MemoryListener vfio_dirty_tracking_listener = {
> +    .name = "vfio-tracking",
> +    .region_add = vfio_dirty_tracking_update,
> +};
> +
> +static void vfio_dirty_tracking_init(VFIOContainer *container)
> +{
> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
> +    qemu_mutex_init(&container->tracking_mutex);
> +    container->tracking_listener = vfio_dirty_tracking_listener;
> +    memory_listener_register(&container->tracking_listener,
> +                             container->space->as);

The following unregister+destroy calls seem to belong to a _fini routine.
Am I missing something ?

Thanks,

C.

> +    memory_listener_unregister(&container->tracking_listener);
> +    qemu_mutex_destroy(&container->tracking_mutex);
> +}
> +
>   static void vfio_listener_log_global_start(MemoryListener *listener)
>   {
>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>       int ret;
>   
> +    vfio_dirty_tracking_init(container);
> +
>       ret = vfio_set_dirty_page_tracking(container, true);
>       if (ret) {
>           vfio_set_migration_error(ret);
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 669d9fe07cd9..d97a6de17921 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>   vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>   vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>   vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>   vfio_disconnect_container(int fd) "close container->fd=%d"
>   vfio_put_group(int fd) "close group->fd=%d"
>   vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 87524c64a443..96791add2719 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -23,6 +23,7 @@
>   
>   #include "exec/memory.h"
>   #include "qemu/queue.h"
> +#include "qemu/iova-tree.h"
>   #include "qemu/notify.h"
>   #include "ui/console.h"
>   #include "hw/display/ramfb.h"
> @@ -68,6 +69,13 @@ typedef struct VFIOMigration {
>       size_t data_buffer_size;
>   } VFIOMigration;
>   
> +typedef struct VFIODirtyTrackingRange {
> +    hwaddr min32;
> +    hwaddr max32;
> +    hwaddr min64;
> +    hwaddr max64;
> +} VFIODirtyTrackingRange;
> +
>   typedef struct VFIOAddressSpace {
>       AddressSpace *as;
>       QLIST_HEAD(, VFIOContainer) containers;
> @@ -89,6 +97,9 @@ typedef struct VFIOContainer {
>       uint64_t max_dirty_bitmap_size;
>       unsigned long pgsizes;
>       unsigned int dma_max_mappings;
> +    VFIODirtyTrackingRange tracking_range;
> +    QemuMutex tracking_mutex;
> +    MemoryListener tracking_listener;
>       QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>       QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>       QLIST_HEAD(, VFIOGroup) group_list;
Joao Martins March 6, 2023, 2:37 p.m. UTC | #2
On 06/03/2023 13:41, Cédric Le Goater wrote:
> On 3/4/23 02:43, Joao Martins wrote:
>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>> the device must be provided all at once upon DMA logging start.
>>
>> As preparation for the following patches which will add device dirty
>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>> can be used for DMA logging start.
>>
>> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
>> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>   hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
>>   hw/vfio/trace-events          |  1 +
>>   include/hw/vfio/vfio-common.h | 11 +++++
>>   3 files changed, 96 insertions(+)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ed908e303dbb..d84e5fd86bb4 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -44,6 +44,7 @@
>>   #include "migration/blocker.h"
>>   #include "migration/qemu-file.h"
>>   #include "sysemu/tpm.h"
>> +#include "qemu/iova-tree.h"
>>     VFIOGroupList vfio_group_list =
>>       QLIST_HEAD_INITIALIZER(vfio_group_list);
>> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer
>> *container, bool start)
>>       return ret;
>>   }
>>   +/*
>> + * Called for the dirty tracking memory listener to calculate the iova/end
>> + * for a given memory region section. The checks here, replicate the logic
>> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
>> + * both listener should be kept in sync.
>> + */
>> +static bool vfio_get_section_iova_range(VFIOContainer *container,
>> +                                        MemoryRegionSection *section,
>> +                                        hwaddr *out_iova, hwaddr *out_end)
>> +{
>> +    Int128 llend;
>> +    hwaddr iova;
>> +
>> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
>> +    llend = int128_make64(section->offset_within_address_space);
>> +    llend = int128_add(llend, section->size);
>> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
>> +
>> +    if (int128_ge(int128_make64(iova), llend)) {
>> +        return false;
>> +    }
>> +
>> +    *out_iova = iova;
>> +    *out_end = int128_get64(llend) - 1;
>> +    return true;
>> +}
>> +
>> +static void vfio_dirty_tracking_update(MemoryListener *listener,
>> +                                       MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            tracking_listener);
>> +    VFIODirtyTrackingRange *range = &container->tracking_range;
>> +    hwaddr max32 = (1ULL << 32) - 1ULL;
>> +    hwaddr iova, end;
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
>> +        return;
>> +    }
>> +
>> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
>> +        if (iova < max32 && end <= max32) {
>> +                if (range->min32 > iova) {
>> +                    range->min32 = iova;
>> +                }
>> +                if (range->max32 < end) {
>> +                    range->max32 = end;
>> +                }
>> +                trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                            range->min32, range->max32);
>> +        } else {
>> +                if (!range->min64 || range->min64 > iova) {
>> +                    range->min64 = iova;
>> +                }
>> +                if (range->max64 < end) {
>> +                    range->max64 = end;
>> +                }
>> +                trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                            range->min64, range->max64);
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static const MemoryListener vfio_dirty_tracking_listener = {
>> +    .name = "vfio-tracking",
>> +    .region_add = vfio_dirty_tracking_update,
>> +};
>> +
>> +static void vfio_dirty_tracking_init(VFIOContainer *container)
>> +{
>> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
>> +    qemu_mutex_init(&container->tracking_mutex);
>> +    container->tracking_listener = vfio_dirty_tracking_listener;
>> +    memory_listener_register(&container->tracking_listener,
>> +                             container->space->as);
> 
> The following unregister+destroy calls seem to belong to a _fini routine.
> Am I missing something ?
> 
The thinking is that once we register the memory listener, it will iterate
over all the sections and once that is finished the memory_listener_register
returns. So the state we initialize here isn't needed anyelse other than to
create the range and hence we destroy it right away. It was at container_init()
but it was unnecessary overhead to keep around if it's *only* needed when we
start/stop dirty tracking.

So the reason I don't add a _fini method is because there's no need to teardown
the state anywhere else other than this function.

I would argue that maybe I don't need the mutex at all as this is all serialized...

> Thanks,
> 
> C.
> 
>> +    memory_listener_unregister(&container->tracking_listener);
>> +    qemu_mutex_destroy(&container->tracking_mutex);
>> +}
>> +
>>   static void vfio_listener_log_global_start(MemoryListener *listener)
>>   {
>>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>       int ret;
>>   +    vfio_dirty_tracking_init(container);
>> +
>>       ret = vfio_set_dirty_page_tracking(container, true);
>>       if (ret) {
>>           vfio_set_migration_error(ret);
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 669d9fe07cd9..d97a6de17921 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t
>> iova, uint64_t offset_wi
>>   vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova,
>> uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64"
>> is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>   vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING
>> region_del 0x%"PRIx64" - 0x%"PRIx64
>>   vfio_listener_region_del(uint64_t start, uint64_t end) "region_del
>> 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min,
>> uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" -
>> 0x%"PRIx64"]"
>>   vfio_disconnect_container(int fd) "close container->fd=%d"
>>   vfio_put_group(int fd) "close group->fd=%d"
>>   vfio_get_device(const char * name, unsigned int flags, unsigned int
>> num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 87524c64a443..96791add2719 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -23,6 +23,7 @@
>>     #include "exec/memory.h"
>>   #include "qemu/queue.h"
>> +#include "qemu/iova-tree.h"
>>   #include "qemu/notify.h"
>>   #include "ui/console.h"
>>   #include "hw/display/ramfb.h"
>> @@ -68,6 +69,13 @@ typedef struct VFIOMigration {
>>       size_t data_buffer_size;
>>   } VFIOMigration;
>>   +typedef struct VFIODirtyTrackingRange {
>> +    hwaddr min32;
>> +    hwaddr max32;
>> +    hwaddr min64;
>> +    hwaddr max64;
>> +} VFIODirtyTrackingRange;
>> +
>>   typedef struct VFIOAddressSpace {
>>       AddressSpace *as;
>>       QLIST_HEAD(, VFIOContainer) containers;
>> @@ -89,6 +97,9 @@ typedef struct VFIOContainer {
>>       uint64_t max_dirty_bitmap_size;
>>       unsigned long pgsizes;
>>       unsigned int dma_max_mappings;
>> +    VFIODirtyTrackingRange tracking_range;
>> +    QemuMutex tracking_mutex;
>> +    MemoryListener tracking_listener;
>>       QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>>       QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>>       QLIST_HEAD(, VFIOGroup) group_list;
>
Alex Williamson March 6, 2023, 3:11 p.m. UTC | #3
On Mon, 6 Mar 2023 14:37:04 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> On 06/03/2023 13:41, Cédric Le Goater wrote:
> > On 3/4/23 02:43, Joao Martins wrote:  
> >> According to the device DMA logging uAPI, IOVA ranges to be logged by
> >> the device must be provided all at once upon DMA logging start.
> >>
> >> As preparation for the following patches which will add device dirty
> >> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> >> can be used for DMA logging start.
> >>
> >> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
> >> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
> >>
> >> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> >> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> >> ---
> >>   hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
> >>   hw/vfio/trace-events          |  1 +
> >>   include/hw/vfio/vfio-common.h | 11 +++++
> >>   3 files changed, 96 insertions(+)
> >>
> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >> index ed908e303dbb..d84e5fd86bb4 100644
> >> --- a/hw/vfio/common.c
> >> +++ b/hw/vfio/common.c
> >> @@ -44,6 +44,7 @@
> >>   #include "migration/blocker.h"
> >>   #include "migration/qemu-file.h"
> >>   #include "sysemu/tpm.h"
> >> +#include "qemu/iova-tree.h"
> >>     VFIOGroupList vfio_group_list =
> >>       QLIST_HEAD_INITIALIZER(vfio_group_list);
> >> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer
> >> *container, bool start)
> >>       return ret;
> >>   }
> >>   +/*
> >> + * Called for the dirty tracking memory listener to calculate the iova/end
> >> + * for a given memory region section. The checks here, replicate the logic
> >> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
> >> + * both listener should be kept in sync.
> >> + */
> >> +static bool vfio_get_section_iova_range(VFIOContainer *container,
> >> +                                        MemoryRegionSection *section,
> >> +                                        hwaddr *out_iova, hwaddr *out_end)
> >> +{
> >> +    Int128 llend;
> >> +    hwaddr iova;
> >> +
> >> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
> >> +    llend = int128_make64(section->offset_within_address_space);
> >> +    llend = int128_add(llend, section->size);
> >> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
> >> +
> >> +    if (int128_ge(int128_make64(iova), llend)) {
> >> +        return false;
> >> +    }
> >> +
> >> +    *out_iova = iova;
> >> +    *out_end = int128_get64(llend) - 1;
> >> +    return true;
> >> +}
> >> +
> >> +static void vfio_dirty_tracking_update(MemoryListener *listener,
> >> +                                       MemoryRegionSection *section)
> >> +{
> >> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> >> +                                            tracking_listener);
> >> +    VFIODirtyTrackingRange *range = &container->tracking_range;
> >> +    hwaddr max32 = (1ULL << 32) - 1ULL;
> >> +    hwaddr iova, end;
> >> +
> >> +    if (!vfio_listener_valid_section(section) ||
> >> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
> >> +        return;
> >> +    }
> >> +
> >> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
> >> +        if (iova < max32 && end <= max32) {
> >> +                if (range->min32 > iova) {
> >> +                    range->min32 = iova;
> >> +                }
> >> +                if (range->max32 < end) {
> >> +                    range->max32 = end;
> >> +                }
> >> +                trace_vfio_device_dirty_tracking_update(iova, end,
> >> +                                            range->min32, range->max32);
> >> +        } else {
> >> +                if (!range->min64 || range->min64 > iova) {
> >> +                    range->min64 = iova;
> >> +                }
> >> +                if (range->max64 < end) {
> >> +                    range->max64 = end;
> >> +                }
> >> +                trace_vfio_device_dirty_tracking_update(iova, end,
> >> +                                            range->min64, range->max64);
> >> +        }
> >> +    }
> >> +    return;
> >> +}
> >> +
> >> +static const MemoryListener vfio_dirty_tracking_listener = {
> >> +    .name = "vfio-tracking",
> >> +    .region_add = vfio_dirty_tracking_update,
> >> +};
> >> +
> >> +static void vfio_dirty_tracking_init(VFIOContainer *container)
> >> +{
> >> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
> >> +    qemu_mutex_init(&container->tracking_mutex);
> >> +    container->tracking_listener = vfio_dirty_tracking_listener;
> >> +    memory_listener_register(&container->tracking_listener,
> >> +                             container->space->as);  
> > 
> > The following unregister+destroy calls seem to belong to a _fini routine.
> > Am I missing something ?
> >   
> The thinking is that once we register the memory listener, it will iterate
> over all the sections and once that is finished the memory_listener_register
> returns. So the state we initialize here isn't needed anyelse other than to
> create the range and hence we destroy it right away. It was at container_init()
> but it was unnecessary overhead to keep around if it's *only* needed when we
> start/stop dirty tracking.
> 
> So the reason I don't add a _fini method is because there's no need to teardown
> the state anywhere else other than this function.
> 
> I would argue that maybe I don't need the mutex at all as this is all serialized...

Right, this is in line with my previous comments that we don't need to
keep the listener around since we don't expect changes to memory
regions, ex. virtio-mem slots are locked down during migration, and we
don't support removal of ranges from the device anyway.  We're done with
the listener after it's built our min/max ranges.  And yes, the mutex
seems superfluous.  Thanks,

Alex
Cédric Le Goater March 6, 2023, 6:05 p.m. UTC | #4
On 3/4/23 02:43, Joao Martins wrote:
> According to the device DMA logging uAPI, IOVA ranges to be logged by
> the device must be provided all at once upon DMA logging start.
> 
> As preparation for the following patches which will add device dirty
> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> can be used for DMA logging start.
> 
> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
> 
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>

One question below,

Reviewed-by: Cédric Le Goater <clg@redhat.com>

Thanks,

C.

> ---
>   hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
>   hw/vfio/trace-events          |  1 +
>   include/hw/vfio/vfio-common.h | 11 +++++
>   3 files changed, 96 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ed908e303dbb..d84e5fd86bb4 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -44,6 +44,7 @@
>   #include "migration/blocker.h"
>   #include "migration/qemu-file.h"
>   #include "sysemu/tpm.h"
> +#include "qemu/iova-tree.h"
>   
>   VFIOGroupList vfio_group_list =
>       QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>       return ret;
>   }
>   
> +/*
> + * Called for the dirty tracking memory listener to calculate the iova/end
> + * for a given memory region section. The checks here, replicate the logic
> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
> + * both listener should be kept in sync.
> + */
> +static bool vfio_get_section_iova_range(VFIOContainer *container,
> +                                        MemoryRegionSection *section,
> +                                        hwaddr *out_iova, hwaddr *out_end)
> +{
> +    Int128 llend;
> +    hwaddr iova;
> +
> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
> +    llend = int128_make64(section->offset_within_address_space);
> +    llend = int128_add(llend, section->size);
> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
> +
> +    if (int128_ge(int128_make64(iova), llend)) {
> +        return false;
> +    }
> +
> +    *out_iova = iova;
> +    *out_end = int128_get64(llend) - 1;
> +    return true;
> +}
> +
> +static void vfio_dirty_tracking_update(MemoryListener *listener,
> +                                       MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            tracking_listener);
> +    VFIODirtyTrackingRange *range = &container->tracking_range;
> +    hwaddr max32 = (1ULL << 32) - 1ULL;
> +    hwaddr iova, end;
> +
> +    if (!vfio_listener_valid_section(section) ||
> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
> +        return;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
> +        if (iova < max32 && end <= max32) {
> +                if (range->min32 > iova) {

With the memset(0) done in vfio_dirty_tracking_init(), min32 will always
be 0. Is that OK ?

> +                    range->min32 = iova;
> +                }
> +                if (range->max32 < end) {
> +                    range->max32 = end;
> +                }
> +                trace_vfio_device_dirty_tracking_update(iova, end,
> +                                            range->min32, range->max32);
> +        } else {
> +                if (!range->min64 || range->min64 > iova) {
> +                    range->min64 = iova;
> +                }
> +                if (range->max64 < end) {
> +                    range->max64 = end;
> +                }
> +                trace_vfio_device_dirty_tracking_update(iova, end,
> +                                            range->min64, range->max64);
> +        }
> +    }
> +    return;
> +}
> +
> +static const MemoryListener vfio_dirty_tracking_listener = {
> +    .name = "vfio-tracking",
> +    .region_add = vfio_dirty_tracking_update,
> +};
> +
> +static void vfio_dirty_tracking_init(VFIOContainer *container)
> +{
> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
> +    qemu_mutex_init(&container->tracking_mutex);
> +    container->tracking_listener = vfio_dirty_tracking_listener;
> +    memory_listener_register(&container->tracking_listener,
> +                             container->space->as);
> +    memory_listener_unregister(&container->tracking_listener);
> +    qemu_mutex_destroy(&container->tracking_mutex);
> +}
> +
>   static void vfio_listener_log_global_start(MemoryListener *listener)
>   {
>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>       int ret;
>   
> +    vfio_dirty_tracking_init(container);
> +
>       ret = vfio_set_dirty_page_tracking(container, true);
>       if (ret) {
>           vfio_set_migration_error(ret);
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 669d9fe07cd9..d97a6de17921 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>   vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>   vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>   vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>   vfio_disconnect_container(int fd) "close container->fd=%d"
>   vfio_put_group(int fd) "close group->fd=%d"
>   vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 87524c64a443..96791add2719 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -23,6 +23,7 @@
>   
>   #include "exec/memory.h"
>   #include "qemu/queue.h"
> +#include "qemu/iova-tree.h"
>   #include "qemu/notify.h"
>   #include "ui/console.h"
>   #include "hw/display/ramfb.h"
> @@ -68,6 +69,13 @@ typedef struct VFIOMigration {
>       size_t data_buffer_size;
>   } VFIOMigration;
>   
> +typedef struct VFIODirtyTrackingRange {
> +    hwaddr min32;
> +    hwaddr max32;
> +    hwaddr min64;
> +    hwaddr max64;
> +} VFIODirtyTrackingRange;
> +
>   typedef struct VFIOAddressSpace {
>       AddressSpace *as;
>       QLIST_HEAD(, VFIOContainer) containers;
> @@ -89,6 +97,9 @@ typedef struct VFIOContainer {
>       uint64_t max_dirty_bitmap_size;
>       unsigned long pgsizes;
>       unsigned int dma_max_mappings;
> +    VFIODirtyTrackingRange tracking_range;
> +    QemuMutex tracking_mutex;
> +    MemoryListener tracking_listener;
>       QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>       QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>       QLIST_HEAD(, VFIOGroup) group_list;
Alex Williamson March 6, 2023, 6:15 p.m. UTC | #5
On Sat,  4 Mar 2023 01:43:37 +0000
Joao Martins <joao.m.martins@oracle.com> wrote:

> According to the device DMA logging uAPI, IOVA ranges to be logged by
> the device must be provided all at once upon DMA logging start.
> 
> As preparation for the following patches which will add device dirty
> page tracking, keep a record of all DMA mapped IOVA ranges so later they
> can be used for DMA logging start.
> 
> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.

Commit log is outdated for this version.

> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> ---
>  hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
>  hw/vfio/trace-events          |  1 +
>  include/hw/vfio/vfio-common.h | 11 +++++
>  3 files changed, 96 insertions(+)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ed908e303dbb..d84e5fd86bb4 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -44,6 +44,7 @@
>  #include "migration/blocker.h"
>  #include "migration/qemu-file.h"
>  #include "sysemu/tpm.h"
> +#include "qemu/iova-tree.h"

Unnecessary

>  
>  VFIOGroupList vfio_group_list =
>      QLIST_HEAD_INITIALIZER(vfio_group_list);
> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>      return ret;
>  }
>  
> +/*
> + * Called for the dirty tracking memory listener to calculate the iova/end
> + * for a given memory region section. The checks here, replicate the logic
> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
> + * both listener should be kept in sync.
> + */
> +static bool vfio_get_section_iova_range(VFIOContainer *container,
> +                                        MemoryRegionSection *section,
> +                                        hwaddr *out_iova, hwaddr *out_end)
> +{
> +    Int128 llend;
> +    hwaddr iova;
> +
> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
> +    llend = int128_make64(section->offset_within_address_space);
> +    llend = int128_add(llend, section->size);
> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
> +
> +    if (int128_ge(int128_make64(iova), llend)) {
> +        return false;
> +    }
> +
> +    *out_iova = iova;
> +    *out_end = int128_get64(llend) - 1;
> +    return true;
> +}

Not sure why this isn't turned into a helper here to avoid the issue
noted in the comment.  Also why do both of the existing listener
implementations resolve the end address as:

	int128_get64(int128_sub(llend, int128_one()));

While here we use:

	int128_get64(llend) - 1;

We're already out of sync.

> +
> +static void vfio_dirty_tracking_update(MemoryListener *listener,
> +                                       MemoryRegionSection *section)
> +{
> +    VFIOContainer *container = container_of(listener, VFIOContainer,
> +                                            tracking_listener);
> +    VFIODirtyTrackingRange *range = &container->tracking_range;
> +    hwaddr max32 = (1ULL << 32) - 1ULL;

UINT32_MAX

> +    hwaddr iova, end;
> +
> +    if (!vfio_listener_valid_section(section) ||
> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
> +        return;
> +    }
> +
> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
> +        if (iova < max32 && end <= max32) {
> +                if (range->min32 > iova) {
> +                    range->min32 = iova;
> +                }
> +                if (range->max32 < end) {
> +                    range->max32 = end;
> +                }
> +                trace_vfio_device_dirty_tracking_update(iova, end,
> +                                            range->min32, range->max32);
> +        } else {
> +                if (!range->min64 || range->min64 > iova) {
> +                    range->min64 = iova;
> +                }

I think this improperly handles a range starting at zero, min64 should
be UINT64_MAX initially.  For example, if we have ranges 0-8GB and
12-16GB, this effectively ignores the first range.  Likewise I think
range->min32 has a similar problem, it's initialized to zero, it will
never be updated to match a non-zero lowest range.  It needs to be
initialized to UINT32_MAX.

A comment describing the purpose of the 32/64 split tracking would be
useful too.

> +                if (range->max64 < end) {
> +                    range->max64 = end;
> +                }
> +                trace_vfio_device_dirty_tracking_update(iova, end,
> +                                            range->min64, range->max64);
> +        }
> +    }
> +    return;
> +}
> +
> +static const MemoryListener vfio_dirty_tracking_listener = {
> +    .name = "vfio-tracking",
> +    .region_add = vfio_dirty_tracking_update,
> +};
> +
> +static void vfio_dirty_tracking_init(VFIOContainer *container)
> +{
> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
> +    qemu_mutex_init(&container->tracking_mutex);

As noted in other thread, this mutex seems unnecessary.

The listener needs to be embedded in an object, but it doesn't need to
be the container.  Couldn't we create:

typedef struct VFIODirtyRanges {
    VFIOContainer *container;
    VFIODirtyTrackingRange ranges;
    MemoryListener listener;
} VFIODirectRanges;

For use here?  Caller could provide VFIODirtyTrackingRange pointer for
the resulting ranges, which then gets passed to
vfio_device_feature_dma_logging_start_create()

> +    container->tracking_listener = vfio_dirty_tracking_listener;
> +    memory_listener_register(&container->tracking_listener,
> +                             container->space->as);
> +    memory_listener_unregister(&container->tracking_listener);

It's sufficiently subtle that the listener callback is synchronous and
we're done with it here for a comment.

> +    qemu_mutex_destroy(&container->tracking_mutex);
> +}
> +
>  static void vfio_listener_log_global_start(MemoryListener *listener)
>  {
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
>  
> +    vfio_dirty_tracking_init(container);
> +
>      ret = vfio_set_dirty_page_tracking(container, true);
>      if (ret) {
>          vfio_set_migration_error(ret);
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 669d9fe07cd9..d97a6de17921 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>  vfio_disconnect_container(int fd) "close container->fd=%d"
>  vfio_put_group(int fd) "close group->fd=%d"
>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 87524c64a443..96791add2719 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -23,6 +23,7 @@
>  
>  #include "exec/memory.h"
>  #include "qemu/queue.h"
> +#include "qemu/iova-tree.h"

Unused.

>  #include "qemu/notify.h"
>  #include "ui/console.h"
>  #include "hw/display/ramfb.h"
> @@ -68,6 +69,13 @@ typedef struct VFIOMigration {
>      size_t data_buffer_size;
>  } VFIOMigration;
>  
> +typedef struct VFIODirtyTrackingRange {
> +    hwaddr min32;
> +    hwaddr max32;
> +    hwaddr min64;
> +    hwaddr max64;
> +} VFIODirtyTrackingRange;
> +
>  typedef struct VFIOAddressSpace {
>      AddressSpace *as;
>      QLIST_HEAD(, VFIOContainer) containers;
> @@ -89,6 +97,9 @@ typedef struct VFIOContainer {
>      uint64_t max_dirty_bitmap_size;
>      unsigned long pgsizes;
>      unsigned int dma_max_mappings;
> +    VFIODirtyTrackingRange tracking_range;
> +    QemuMutex tracking_mutex;
> +    MemoryListener tracking_listener;

Let's make this unnecessary too.  Thanks,

Alex

>      QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>      QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>      QLIST_HEAD(, VFIOGroup) group_list;
Joao Martins March 6, 2023, 7:32 p.m. UTC | #6
On 06/03/2023 18:15, Alex Williamson wrote:
> On Sat,  4 Mar 2023 01:43:37 +0000
> Joao Martins <joao.m.martins@oracle.com> wrote:
> 
>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>> the device must be provided all at once upon DMA logging start.
>>
>> As preparation for the following patches which will add device dirty
>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>> can be used for DMA logging start.
>>
>> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
>> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
> 
> Commit log is outdated for this version.
>
I will remove the paragraph. I can't mention vIOMMU usage blocks migration as I
effectively do that later in the series.

>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> ---
>>  hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
>>  hw/vfio/trace-events          |  1 +
>>  include/hw/vfio/vfio-common.h | 11 +++++
>>  3 files changed, 96 insertions(+)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ed908e303dbb..d84e5fd86bb4 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -44,6 +44,7 @@
>>  #include "migration/blocker.h"
>>  #include "migration/qemu-file.h"
>>  #include "sysemu/tpm.h"
>> +#include "qemu/iova-tree.h"
> 
> Unnecessary
> 
True, I had it removed for v4 as Avihai pointed that out to me too offlist.
Same for the other one down below.

>>  
>>  VFIOGroupList vfio_group_list =
>>      QLIST_HEAD_INITIALIZER(vfio_group_list);
>> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>>      return ret;
>>  }
>>  
>> +/*
>> + * Called for the dirty tracking memory listener to calculate the iova/end
>> + * for a given memory region section. The checks here, replicate the logic
>> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
>> + * both listener should be kept in sync.
>> + */
>> +static bool vfio_get_section_iova_range(VFIOContainer *container,
>> +                                        MemoryRegionSection *section,
>> +                                        hwaddr *out_iova, hwaddr *out_end)
>> +{
>> +    Int128 llend;
>> +    hwaddr iova;
>> +
>> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
>> +    llend = int128_make64(section->offset_within_address_space);
>> +    llend = int128_add(llend, section->size);
>> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
>> +
>> +    if (int128_ge(int128_make64(iova), llend)) {
>> +        return false;
>> +    }
>> +
>> +    *out_iova = iova;
>> +    *out_end = int128_get64(llend) - 1;
>> +    return true;
>> +}
> 
> Not sure why this isn't turned into a helper here to avoid the issue
> noted in the comment. 

The reason I didn't directly reused, as the calculation happens in different
places in the existing listener. But I noticed now that I am confusing with
llsize (in the old checks I have now removed). @end is not used in the check
that preceeds it, so I am moving this calculation into a helper. Presumably I
have a new preceeding patch where I have this vfio_get_section_iova_range()
added. and this patch just uses it.

> Also why do both of the existing listener
> implementations resolve the end address as:
> 
> 	int128_get64(int128_sub(llend, int128_one()));
> 
> While here we use:
> 
> 	int128_get64(llend) - 1;
> 
> We're already out of sync.
> 
True

>> +
>> +static void vfio_dirty_tracking_update(MemoryListener *listener,
>> +                                       MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            tracking_listener);
>> +    VFIODirtyTrackingRange *range = &container->tracking_range;
>> +    hwaddr max32 = (1ULL << 32) - 1ULL;
> 
> UINT32_MAX
> 
/me nods

>> +    hwaddr iova, end;
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
>> +        return;
>> +    }
>> +
>> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
>> +        if (iova < max32 && end <= max32) {
>> +                if (range->min32 > iova) {
>> +                    range->min32 = iova;
>> +                }
>> +                if (range->max32 < end) {
>> +                    range->max32 = end;
>> +                }
>> +                trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                            range->min32, range->max32);
>> +        } else {
>> +                if (!range->min64 || range->min64 > iova) {
>> +                    range->min64 = iova;
>> +                }
> 
> I think this improperly handles a range starting at zero, min64 should
> be UINT64_MAX initially.  For example, if we have ranges 0-8GB and
> 12-16GB, this effectively ignores the first range.  Likewise I think
> range->min32 has a similar problem, it's initialized to zero, it will
> never be updated to match a non-zero lowest range.  It needs to be
> initialized to UINT32_MAX.
> 
Yes, let me switch to that. I'll place the min64/min32 to UINT64_MAX/UINT32_MAX
in the place where we initialize the state for the dma tracking listener.

> A comment describing the purpose of the 32/64 split tracking would be
> useful too.
> 

Yes, definitely e.g.

/*
 * The address space passed to the dirty tracker is reduced to two ranges: one
 * for 32-bit DMA ranges, and another one for 64-bit DMA ranges. The underlying
 * reports of dirty will query a sub-interval of each of these ranges. The
 * purpose of the dual range handling is to handle known cases of big holes in
 * the address space, like the x86 AMD 1T hole. The alternative would be an
 * IOVATree but that has a much bigger runtime overhead and unnecessary
 * complexity.
 */

>> +                if (range->max64 < end) {
>> +                    range->max64 = end;
>> +                }
>> +                trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                            range->min64, range->max64);
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static const MemoryListener vfio_dirty_tracking_listener = {
>> +    .name = "vfio-tracking",
>> +    .region_add = vfio_dirty_tracking_update,
>> +};
>> +
>> +static void vfio_dirty_tracking_init(VFIOContainer *container)
>> +{
>> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
>> +    qemu_mutex_init(&container->tracking_mutex);
> 
> As noted in other thread, this mutex seems unnecessary.
> 
Already deleted it for v4.

> The listener needs to be embedded in an object, but it doesn't need to
> be the container.  Couldn't we create:
> 
> typedef struct VFIODirtyRanges {
>     VFIOContainer *container;
>     VFIODirtyTrackingRange ranges;
>     MemoryListener listener;
> } VFIODirectRanges;
> 
> For use here?  Caller could provide VFIODirtyTrackingRange pointer for
> the resulting ranges, which then gets passed to
> vfio_device_feature_dma_logging_start_create()

Oh, that would be so much cleaner, yes. Will switch to that.

> 
>> +    container->tracking_listener = vfio_dirty_tracking_listener;
>> +    memory_listener_register(&container->tracking_listener,
>> +                             container->space->as);
>> +    memory_listener_unregister(&container->tracking_listener);
> 
> It's sufficiently subtle that the listener callback is synchronous and
> we're done with it here for a comment.
> 

Will add a comment e.g.:

 /*
  * The memory listener is synchronous, and used to calculate the range to dirty
  * tracking. Unregister it after we are done as we are not interesting in any
  * follow-up updates.
  */

>> +    qemu_mutex_destroy(&container->tracking_mutex);
>> +}
>> +
>>  static void vfio_listener_log_global_start(MemoryListener *listener)
>>  {
>>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>      int ret;
>>  
>> +    vfio_dirty_tracking_init(container);
>> +
>>      ret = vfio_set_dirty_page_tracking(container, true);
>>      if (ret) {
>>          vfio_set_migration_error(ret);
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 669d9fe07cd9..d97a6de17921 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
>>  vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>  vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
>>  vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
>>  vfio_disconnect_container(int fd) "close container->fd=%d"
>>  vfio_put_group(int fd) "close group->fd=%d"
>>  vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 87524c64a443..96791add2719 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -23,6 +23,7 @@
>>  
>>  #include "exec/memory.h"
>>  #include "qemu/queue.h"
>> +#include "qemu/iova-tree.h"
> 
> Unused.
> 
Had it removed already preemptively

>>  #include "qemu/notify.h"
>>  #include "ui/console.h"
>>  #include "hw/display/ramfb.h"
>> @@ -68,6 +69,13 @@ typedef struct VFIOMigration {
>>      size_t data_buffer_size;
>>  } VFIOMigration;
>>  
>> +typedef struct VFIODirtyTrackingRange {
>> +    hwaddr min32;
>> +    hwaddr max32;
>> +    hwaddr min64;
>> +    hwaddr max64;
>> +} VFIODirtyTrackingRange;
>> +
>>  typedef struct VFIOAddressSpace {
>>      AddressSpace *as;
>>      QLIST_HEAD(, VFIOContainer) containers;
>> @@ -89,6 +97,9 @@ typedef struct VFIOContainer {
>>      uint64_t max_dirty_bitmap_size;
>>      unsigned long pgsizes;
>>      unsigned int dma_max_mappings;
>> +    VFIODirtyTrackingRange tracking_range;
>> +    QemuMutex tracking_mutex;
>> +    MemoryListener tracking_listener;
> 
> Let's make this unnecessary too.  Thanks,
> 
Got it.
Joao Martins March 6, 2023, 7:45 p.m. UTC | #7
On 06/03/2023 18:05, Cédric Le Goater wrote:
> On 3/4/23 02:43, Joao Martins wrote:
>> According to the device DMA logging uAPI, IOVA ranges to be logged by
>> the device must be provided all at once upon DMA logging start.
>>
>> As preparation for the following patches which will add device dirty
>> page tracking, keep a record of all DMA mapped IOVA ranges so later they
>> can be used for DMA logging start.
>>
>> Note that when vIOMMU is enabled DMA mapped IOVA ranges are not tracked.
>> This is due to the dynamic nature of vIOMMU DMA mapping/unmapping.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> 
> One question below,
> 
> Reviewed-by: Cédric Le Goater <clg@redhat.com>
> 
> Thanks,
> 
> C.
> 
>> ---
>>   hw/vfio/common.c              | 84 +++++++++++++++++++++++++++++++++++
>>   hw/vfio/trace-events          |  1 +
>>   include/hw/vfio/vfio-common.h | 11 +++++
>>   3 files changed, 96 insertions(+)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ed908e303dbb..d84e5fd86bb4 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -44,6 +44,7 @@
>>   #include "migration/blocker.h"
>>   #include "migration/qemu-file.h"
>>   #include "sysemu/tpm.h"
>> +#include "qemu/iova-tree.h"
>>     VFIOGroupList vfio_group_list =
>>       QLIST_HEAD_INITIALIZER(vfio_group_list);
>> @@ -1313,11 +1314,94 @@ static int vfio_set_dirty_page_tracking(VFIOContainer
>> *container, bool start)
>>       return ret;
>>   }
>>   +/*
>> + * Called for the dirty tracking memory listener to calculate the iova/end
>> + * for a given memory region section. The checks here, replicate the logic
>> + * in vfio_listener_region_{add,del}() used for the same purpose. And thus
>> + * both listener should be kept in sync.
>> + */
>> +static bool vfio_get_section_iova_range(VFIOContainer *container,
>> +                                        MemoryRegionSection *section,
>> +                                        hwaddr *out_iova, hwaddr *out_end)
>> +{
>> +    Int128 llend;
>> +    hwaddr iova;
>> +
>> +    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
>> +    llend = int128_make64(section->offset_within_address_space);
>> +    llend = int128_add(llend, section->size);
>> +    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
>> +
>> +    if (int128_ge(int128_make64(iova), llend)) {
>> +        return false;
>> +    }
>> +
>> +    *out_iova = iova;
>> +    *out_end = int128_get64(llend) - 1;
>> +    return true;
>> +}
>> +
>> +static void vfio_dirty_tracking_update(MemoryListener *listener,
>> +                                       MemoryRegionSection *section)
>> +{
>> +    VFIOContainer *container = container_of(listener, VFIOContainer,
>> +                                            tracking_listener);
>> +    VFIODirtyTrackingRange *range = &container->tracking_range;
>> +    hwaddr max32 = (1ULL << 32) - 1ULL;
>> +    hwaddr iova, end;
>> +
>> +    if (!vfio_listener_valid_section(section) ||
>> +        !vfio_get_section_iova_range(container, section, &iova, &end)) {
>> +        return;
>> +    }
>> +
>> +    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
>> +        if (iova < max32 && end <= max32) {
>> +                if (range->min32 > iova) {
> 
> With the memset(0) done in vfio_dirty_tracking_init(), min32 will always
> be 0. Is that OK ?
> 
While it's OK, it's making an assumption that it's zero-ed out. But Alex
comments will make this more clear (and cover all cases) and avoid assumption of
always having a range starting from 0.

>> +                    range->min32 = iova;
>> +                }
>> +                if (range->max32 < end) {
>> +                    range->max32 = end;
>> +                }
>> +                trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                            range->min32, range->max32);
>> +        } else {
>> +                if (!range->min64 || range->min64 > iova) {
>> +                    range->min64 = iova;
>> +                }
>> +                if (range->max64 < end) {
>> +                    range->max64 = end;
>> +                }
>> +                trace_vfio_device_dirty_tracking_update(iova, end,
>> +                                            range->min64, range->max64);
>> +        }
>> +    }
>> +    return;
>> +}
>> +
>> +static const MemoryListener vfio_dirty_tracking_listener = {
>> +    .name = "vfio-tracking",
>> +    .region_add = vfio_dirty_tracking_update,
>> +};
>> +
>> +static void vfio_dirty_tracking_init(VFIOContainer *container)
>> +{
>> +    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
>> +    qemu_mutex_init(&container->tracking_mutex);
>> +    container->tracking_listener = vfio_dirty_tracking_listener;
>> +    memory_listener_register(&container->tracking_listener,
>> +                             container->space->as);
>> +    memory_listener_unregister(&container->tracking_listener);
>> +    qemu_mutex_destroy(&container->tracking_mutex);
>> +}
>> +
>>   static void vfio_listener_log_global_start(MemoryListener *listener)
>>   {
>>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>       int ret;
>>   +    vfio_dirty_tracking_init(container);
>> +
>>       ret = vfio_set_dirty_page_tracking(container, true);
>>       if (ret) {
>>           vfio_set_migration_error(ret);
>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>> index 669d9fe07cd9..d97a6de17921 100644
>> --- a/hw/vfio/trace-events
>> +++ b/hw/vfio/trace-events
>> @@ -104,6 +104,7 @@ vfio_known_safe_misalignment(const char *name, uint64_t
>> iova, uint64_t offset_wi
>>   vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova,
>> uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64"
>> is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
>>   vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING
>> region_del 0x%"PRIx64" - 0x%"PRIx64
>>   vfio_listener_region_del(uint64_t start, uint64_t end) "region_del
>> 0x%"PRIx64" - 0x%"PRIx64
>> +vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min,
>> uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" -
>> 0x%"PRIx64"]"
>>   vfio_disconnect_container(int fd) "close container->fd=%d"
>>   vfio_put_group(int fd) "close group->fd=%d"
>>   vfio_get_device(const char * name, unsigned int flags, unsigned int
>> num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 87524c64a443..96791add2719 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -23,6 +23,7 @@
>>     #include "exec/memory.h"
>>   #include "qemu/queue.h"
>> +#include "qemu/iova-tree.h"
>>   #include "qemu/notify.h"
>>   #include "ui/console.h"
>>   #include "hw/display/ramfb.h"
>> @@ -68,6 +69,13 @@ typedef struct VFIOMigration {
>>       size_t data_buffer_size;
>>   } VFIOMigration;
>>   +typedef struct VFIODirtyTrackingRange {
>> +    hwaddr min32;
>> +    hwaddr max32;
>> +    hwaddr min64;
>> +    hwaddr max64;
>> +} VFIODirtyTrackingRange;
>> +
>>   typedef struct VFIOAddressSpace {
>>       AddressSpace *as;
>>       QLIST_HEAD(, VFIOContainer) containers;
>> @@ -89,6 +97,9 @@ typedef struct VFIOContainer {
>>       uint64_t max_dirty_bitmap_size;
>>       unsigned long pgsizes;
>>       unsigned int dma_max_mappings;
>> +    VFIODirtyTrackingRange tracking_range;
>> +    QemuMutex tracking_mutex;
>> +    MemoryListener tracking_listener;
>>       QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
>>       QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
>>       QLIST_HEAD(, VFIOGroup) group_list;
>
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ed908e303dbb..d84e5fd86bb4 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -44,6 +44,7 @@ 
 #include "migration/blocker.h"
 #include "migration/qemu-file.h"
 #include "sysemu/tpm.h"
+#include "qemu/iova-tree.h"
 
 VFIOGroupList vfio_group_list =
     QLIST_HEAD_INITIALIZER(vfio_group_list);
@@ -1313,11 +1314,94 @@  static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
     return ret;
 }
 
+/*
+ * Called for the dirty tracking memory listener to calculate the iova/end
+ * for a given memory region section. The checks here, replicate the logic
+ * in vfio_listener_region_{add,del}() used for the same purpose. And thus
+ * both listener should be kept in sync.
+ */
+static bool vfio_get_section_iova_range(VFIOContainer *container,
+                                        MemoryRegionSection *section,
+                                        hwaddr *out_iova, hwaddr *out_end)
+{
+    Int128 llend;
+    hwaddr iova;
+
+    iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
+    llend = int128_make64(section->offset_within_address_space);
+    llend = int128_add(llend, section->size);
+    llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
+
+    if (int128_ge(int128_make64(iova), llend)) {
+        return false;
+    }
+
+    *out_iova = iova;
+    *out_end = int128_get64(llend) - 1;
+    return true;
+}
+
+static void vfio_dirty_tracking_update(MemoryListener *listener,
+                                       MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            tracking_listener);
+    VFIODirtyTrackingRange *range = &container->tracking_range;
+    hwaddr max32 = (1ULL << 32) - 1ULL;
+    hwaddr iova, end;
+
+    if (!vfio_listener_valid_section(section) ||
+        !vfio_get_section_iova_range(container, section, &iova, &end)) {
+        return;
+    }
+
+    WITH_QEMU_LOCK_GUARD(&container->tracking_mutex) {
+        if (iova < max32 && end <= max32) {
+                if (range->min32 > iova) {
+                    range->min32 = iova;
+                }
+                if (range->max32 < end) {
+                    range->max32 = end;
+                }
+                trace_vfio_device_dirty_tracking_update(iova, end,
+                                            range->min32, range->max32);
+        } else {
+                if (!range->min64 || range->min64 > iova) {
+                    range->min64 = iova;
+                }
+                if (range->max64 < end) {
+                    range->max64 = end;
+                }
+                trace_vfio_device_dirty_tracking_update(iova, end,
+                                            range->min64, range->max64);
+        }
+    }
+    return;
+}
+
+static const MemoryListener vfio_dirty_tracking_listener = {
+    .name = "vfio-tracking",
+    .region_add = vfio_dirty_tracking_update,
+};
+
+static void vfio_dirty_tracking_init(VFIOContainer *container)
+{
+    memset(&container->tracking_range, 0, sizeof(container->tracking_range));
+    qemu_mutex_init(&container->tracking_mutex);
+    container->tracking_listener = vfio_dirty_tracking_listener;
+    memory_listener_register(&container->tracking_listener,
+                             container->space->as);
+    memory_listener_unregister(&container->tracking_listener);
+    qemu_mutex_destroy(&container->tracking_mutex);
+}
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;
 
+    vfio_dirty_tracking_init(container);
+
     ret = vfio_set_dirty_page_tracking(container, true);
     if (ret) {
         vfio_set_migration_error(ret);
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 669d9fe07cd9..d97a6de17921 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -104,6 +104,7 @@  vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_wi
 vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
 vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
 vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
+vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
 vfio_disconnect_container(int fd) "close container->fd=%d"
 vfio_put_group(int fd) "close group->fd=%d"
 vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 87524c64a443..96791add2719 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -23,6 +23,7 @@ 
 
 #include "exec/memory.h"
 #include "qemu/queue.h"
+#include "qemu/iova-tree.h"
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
@@ -68,6 +69,13 @@  typedef struct VFIOMigration {
     size_t data_buffer_size;
 } VFIOMigration;
 
+typedef struct VFIODirtyTrackingRange {
+    hwaddr min32;
+    hwaddr max32;
+    hwaddr min64;
+    hwaddr max64;
+} VFIODirtyTrackingRange;
+
 typedef struct VFIOAddressSpace {
     AddressSpace *as;
     QLIST_HEAD(, VFIOContainer) containers;
@@ -89,6 +97,9 @@  typedef struct VFIOContainer {
     uint64_t max_dirty_bitmap_size;
     unsigned long pgsizes;
     unsigned int dma_max_mappings;
+    VFIODirtyTrackingRange tracking_range;
+    QemuMutex tracking_mutex;
+    MemoryListener tracking_listener;
     QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
     QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
     QLIST_HEAD(, VFIOGroup) group_list;