diff mbox series

[v2,11/20] vfio/common: Add device dirty page tracking start/stop

Message ID 20230222174915.5647-12-avihaih@nvidia.com
State New
Headers show
Series vfio: Add migration pre-copy support and device dirty tracking | expand

Commit Message

Avihai Horon Feb. 22, 2023, 5:49 p.m. UTC
From: Joao Martins <joao.m.martins@oracle.com>

Add device dirty page tracking start/stop functionality. This uses the
device DMA logging uAPI to start and stop dirty page tracking by device.

Device dirty page tracking is used only if all devices within a
container support device dirty page tracking.

Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 include/hw/vfio/vfio-common.h |   2 +
 hw/vfio/common.c              | 211 +++++++++++++++++++++++++++++++++-
 2 files changed, 211 insertions(+), 2 deletions(-)
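
For background on the uAPI the commit message refers to: device DMA logging is driven through the VFIO_DEVICE_FEATURE ioctl with a vfio_device_feature_dma_logging_control payload. Below is a minimal, illustrative sketch (not part of the patch) that starts logging on one device fd for a single IOVA range; the structures come from <linux/vfio.h>, while device_fd, iova and length are placeholders supplied by the caller.

#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int vfio_dma_logging_start_one_range(int device_fd, uint64_t iova,
                                            uint64_t length)
{
    struct vfio_device_feature_dma_logging_range range = {
        .iova = iova,
        .length = length,                   /* byte length of the range */
    };
    struct vfio_device_feature_dma_logging_control control = {
        .page_size = (uint64_t)sysconf(_SC_PAGESIZE),
        .num_ranges = 1,
        .ranges = (uintptr_t)&range,
    };
    /* uint64_t-aligned buffer holding the feature header plus the control payload */
    uint64_t buf[(sizeof(struct vfio_device_feature) + sizeof(control) +
                  sizeof(uint64_t) - 1) / sizeof(uint64_t)];
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    memset(buf, 0, sizeof(buf));
    feature->argsz = sizeof(struct vfio_device_feature) + sizeof(control);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
    memcpy(feature->data, &control, sizeof(control));

    return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature) ? -errno : 0;
}

Stopping uses the same ioctl with VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP and no payload beyond the feature header, which is what the patch's vfio_devices_dma_logging_stop() below does.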

Comments

Alex Williamson Feb. 22, 2023, 10:40 p.m. UTC | #1
On Wed, 22 Feb 2023 19:49:06 +0200
Avihai Horon <avihaih@nvidia.com> wrote:

> From: Joao Martins <joao.m.martins@oracle.com>
> 
> Add device dirty page tracking start/stop functionality. This uses the
> device DMA logging uAPI to start and stop dirty page tracking by device.
> 
> Device dirty page tracking is used only if all devices within a
> container support device dirty page tracking.
> 
> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>  include/hw/vfio/vfio-common.h |   2 +
>  hw/vfio/common.c              | 211 +++++++++++++++++++++++++++++++++-
>  2 files changed, 211 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 6f36876ce0..1f21e1fa43 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -149,6 +149,8 @@ typedef struct VFIODevice {
>      VFIOMigration *migration;
>      Error *migration_blocker;
>      OnOffAuto pre_copy_dirty_page_tracking;
> +    bool dirty_pages_supported;
> +    bool dirty_tracking;
>  } VFIODevice;
>  
>  struct VFIODeviceOps {
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 6041da6c7e..740153e7d7 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -473,6 +473,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
>      return true;
>  }
>  
> +static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +
> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (!vbasedev->dirty_pages_supported) {
> +                return false;
> +            }
> +        }
> +    }
> +
> +    return true;
> +}
> +
>  /*
>   * Check if all VFIO devices are running and migration is active, which is
>   * essentially equivalent to the migration being in pre-copy phase.
> @@ -1404,13 +1420,192 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>      return ret;
>  }
>  
> +static int vfio_devices_dma_logging_set(VFIOContainer *container,
> +                                        struct vfio_device_feature *feature)
> +{
> +    bool status = (feature->flags & VFIO_DEVICE_FEATURE_MASK) ==
> +                  VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
> +    VFIODevice *vbasedev;
> +    VFIOGroup *group;
> +    int ret = 0;
> +
> +    QLIST_FOREACH(group, &container->group_list, container_next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if (vbasedev->dirty_tracking == status) {
> +                continue;
> +            }
> +
> +            ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
> +            if (ret) {
> +                ret = -errno;
> +                error_report("%s: Failed to set DMA logging %s, err %d (%s)",
> +                             vbasedev->name, status ? "start" : "stop", ret,
> +                             strerror(errno));
> +                goto out;
> +            }
> +            vbasedev->dirty_tracking = status;
> +        }
> +    }
> +
> +out:
> +    return ret;
> +}
> +
> +static int vfio_devices_dma_logging_stop(VFIOContainer *container)
> +{
> +    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
> +                              sizeof(uint64_t))] = {};
> +    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
> +
> +    feature->argsz = sizeof(buf);
> +    feature->flags = VFIO_DEVICE_FEATURE_SET;
> +    feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
> +
> +    return vfio_devices_dma_logging_set(container, feature);
> +}
> +
> +static gboolean vfio_device_dma_logging_range_add(DMAMap *map, gpointer data)
> +{
> +    struct vfio_device_feature_dma_logging_range **out = data;
> +    struct vfio_device_feature_dma_logging_range *range = *out;
> +
> +    range->iova = map->iova;
> +    /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
> +    range->length = map->size + 1;
> +
> +    *out = ++range;
> +
> +    return false;
> +}
> +
> +static gboolean vfio_iova_tree_get_first(DMAMap *map, gpointer data)
> +{
> +    DMAMap *first = data;
> +
> +    first->iova = map->iova;
> +    first->size = map->size;
> +
> +    return true;
> +}
> +
> +static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data)
> +{
> +    DMAMap *last = data;
> +
> +    last->iova = map->iova;
> +    last->size = map->size;
> +
> +    return false;
> +}
> +
> +static struct vfio_device_feature *
> +vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
> +{
> +    struct vfio_device_feature *feature;
> +    size_t feature_size;
> +    struct vfio_device_feature_dma_logging_control *control;
> +    struct vfio_device_feature_dma_logging_range *ranges;
> +    unsigned int max_ranges;
> +    unsigned int cur_ranges;
> +
> +    feature_size = sizeof(struct vfio_device_feature) +
> +                   sizeof(struct vfio_device_feature_dma_logging_control);
> +    feature = g_malloc0(feature_size);
> +    feature->argsz = feature_size;
> +    feature->flags = VFIO_DEVICE_FEATURE_SET;
> +    feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
> +
> +    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
> +    control->page_size = qemu_real_host_page_size();
> +
> +    QEMU_LOCK_GUARD(&container->mappings_mutex);
> +
> +    /*
> +     * DMA logging uAPI guarantees to support at least num_ranges that fits into
> +     * a single host kernel page. To be on the safe side, use this as a limit
> +     * from which to merge to a single range.
> +     */
> +    max_ranges = qemu_real_host_page_size() / sizeof(*ranges);
> +    cur_ranges = iova_tree_nnodes(container->mappings);
> +    control->num_ranges = (cur_ranges <= max_ranges) ? cur_ranges : 1;

This makes me suspicious that we're implementing to the characteristics
of a specific device rather than strictly to the vfio migration API.
Are we just trying to avoid the error handling to support the try and
fall back to a single range behavior?  If we want to make a
simplification, then document it as such.  The "[t]o be on the safe
side" phrasing above could later be interpreted as avoiding an issue
and might discourage a more complete implementation.  Thanks,

Alex

> +    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
> +                        control->num_ranges);
> +    if (!ranges) {
> +        g_free(feature);
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    control->ranges = (uint64_t)ranges;
> +    if (cur_ranges <= max_ranges) {
> +        iova_tree_foreach(container->mappings,
> +                          vfio_device_dma_logging_range_add, &ranges);
> +    } else {
> +        DMAMap first, last;
> +
> +        iova_tree_foreach(container->mappings, vfio_iova_tree_get_first,
> +                          &first);
> +        iova_tree_foreach(container->mappings, vfio_iova_tree_get_last, &last);
> +        ranges->iova = first.iova;
> +        /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
> +        ranges->length = (last.iova - first.iova) + last.size + 1;
> +    }
> +
> +    return feature;
> +}
> +
> +static void vfio_device_feature_dma_logging_start_destroy(
> +    struct vfio_device_feature *feature)
> +{
> +    struct vfio_device_feature_dma_logging_control *control =
> +        (struct vfio_device_feature_dma_logging_control *)feature->data;
> +    struct vfio_device_feature_dma_logging_range *ranges =
> +        (struct vfio_device_feature_dma_logging_range *)control->ranges;
> +
> +    g_free(ranges);
> +    g_free(feature);
> +}
> +
> +static int vfio_devices_dma_logging_start(VFIOContainer *container)
> +{
> +    struct vfio_device_feature *feature;
> +    int ret;
> +
> +    feature = vfio_device_feature_dma_logging_start_create(container);
> +    if (!feature) {
> +        return -errno;
> +    }
> +
> +    ret = vfio_devices_dma_logging_set(container, feature);
> +    if (ret) {
> +        vfio_devices_dma_logging_stop(container);
> +    }
> +
> +    vfio_device_feature_dma_logging_start_destroy(feature);
> +
> +    return ret;
> +}
> +
>  static void vfio_listener_log_global_start(MemoryListener *listener)
>  {
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
>  
> -    ret = vfio_set_dirty_page_tracking(container, true);
> +    if (vfio_devices_all_device_dirty_tracking(container)) {
> +        if (vfio_have_giommu(container)) {
> +            /* Device dirty page tracking currently doesn't support vIOMMU */
> +            return;
> +        }
> +
> +        ret = vfio_devices_dma_logging_start(container);
> +    } else {
> +        ret = vfio_set_dirty_page_tracking(container, true);
> +    }
> +
>      if (ret) {
> +        error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
> +                     ret, strerror(-ret));
>          vfio_set_migration_error(ret);
>      }
>  }
> @@ -1420,8 +1615,20 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
>      VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>      int ret;
>  
> -    ret = vfio_set_dirty_page_tracking(container, false);
> +    if (vfio_devices_all_device_dirty_tracking(container)) {
> +        if (vfio_have_giommu(container)) {
> +            /* Device dirty page tracking currently doesn't support vIOMMU */
> +            return;
> +        }
> +
> +        ret = vfio_devices_dma_logging_stop(container);
> +    } else {
> +        ret = vfio_set_dirty_page_tracking(container, false);
> +    }
> +
>      if (ret) {
> +        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
> +                     ret, strerror(-ret));
>          vfio_set_migration_error(ret);
>      }
>  }
Jason Gunthorpe Feb. 23, 2023, 2:02 a.m. UTC | #2
On Wed, Feb 22, 2023 at 03:40:43PM -0700, Alex Williamson wrote:
> > +    /*
> > +     * DMA logging uAPI guarantees to support at least num_ranges that fits into
> > +     * a single host kernel page. To be on the safe side, use this as a limit
> > +     * from which to merge to a single range.
> > +     */
> > +    max_ranges = qemu_real_host_page_size() / sizeof(*ranges);
> > +    cur_ranges = iova_tree_nnodes(container->mappings);
> > +    control->num_ranges = (cur_ranges <= max_ranges) ? cur_ranges : 1;
> 
> This makes me suspicious that we're implementing to the characteristics
> of a specific device rather than strictly to the vfio migration API.
> Are we just trying to avoid the error handling to support the try and
> fall back to a single range behavior?

This was what we agreed to when making the kernel patches. Userspace
is restricted to sending one page of range list to the kernel, and the
kernel will always adjust that to whatever smaller list the device needs.

We added this limit only because we don't want to have a way for
userspace to consume a lot of kernel memory.

See LOG_MAX_RANGES in vfio_main.c

If qemu is in viommu mode and it has a huge number of ranges, then it must
cut the list down before passing it to the kernel.

Jason
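
To put a number on that restriction: struct vfio_device_feature_dma_logging_range is two u64 fields (16 bytes), so a 4 KiB kernel page bounds the guaranteed list at 256 entries. A small sketch of how userspace might derive the cap at runtime (assuming, as the patch does, that the host page size is the relevant kernel page size):

#include <unistd.h>
#include <linux/vfio.h>

/*
 * Sketch: the number of dma_logging_range entries userspace can rely on the
 * kernel accepting, i.e. as many as fit in one host kernel page.
 */
static unsigned int vfio_dma_logging_max_ranges(void)
{
    return sysconf(_SC_PAGESIZE) /
           sizeof(struct vfio_device_feature_dma_logging_range);
}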
Avihai Horon Feb. 23, 2023, 3:36 p.m. UTC | #3
On 23/02/2023 0:40, Alex Williamson wrote:
>
> On Wed, 22 Feb 2023 19:49:06 +0200
> Avihai Horon <avihaih@nvidia.com> wrote:
>
>> From: Joao Martins <joao.m.martins@oracle.com>
>>
>> Add device dirty page tracking start/stop functionality. This uses the
>> device DMA logging uAPI to start and stop dirty page tracking by device.
>>
>> Device dirty page tracking is used only if all devices within a
>> container support device dirty page tracking.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@oracle.com>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> ---
>>   include/hw/vfio/vfio-common.h |   2 +
>>   hw/vfio/common.c              | 211 +++++++++++++++++++++++++++++++++-
>>   2 files changed, 211 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
>> index 6f36876ce0..1f21e1fa43 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -149,6 +149,8 @@ typedef struct VFIODevice {
>>       VFIOMigration *migration;
>>       Error *migration_blocker;
>>       OnOffAuto pre_copy_dirty_page_tracking;
>> +    bool dirty_pages_supported;
>> +    bool dirty_tracking;
>>   } VFIODevice;
>>
>>   struct VFIODeviceOps {
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 6041da6c7e..740153e7d7 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -473,6 +473,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
>>       return true;
>>   }
>>
>> +static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
>> +{
>> +    VFIOGroup *group;
>> +    VFIODevice *vbasedev;
>> +
>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +            if (!vbasedev->dirty_pages_supported) {
>> +                return false;
>> +            }
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>>   /*
>>    * Check if all VFIO devices are running and migration is active, which is
>>    * essentially equivalent to the migration being in pre-copy phase.
>> @@ -1404,13 +1420,192 @@ static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
>>       return ret;
>>   }
>>
>> +static int vfio_devices_dma_logging_set(VFIOContainer *container,
>> +                                        struct vfio_device_feature *feature)
>> +{
>> +    bool status = (feature->flags & VFIO_DEVICE_FEATURE_MASK) ==
>> +                  VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
>> +    VFIODevice *vbasedev;
>> +    VFIOGroup *group;
>> +    int ret = 0;
>> +
>> +    QLIST_FOREACH(group, &container->group_list, container_next) {
>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +            if (vbasedev->dirty_tracking == status) {
>> +                continue;
>> +            }
>> +
>> +            ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
>> +            if (ret) {
>> +                ret = -errno;
>> +                error_report("%s: Failed to set DMA logging %s, err %d (%s)",
>> +                             vbasedev->name, status ? "start" : "stop", ret,
>> +                             strerror(errno));
>> +                goto out;
>> +            }
>> +            vbasedev->dirty_tracking = status;
>> +        }
>> +    }
>> +
>> +out:
>> +    return ret;
>> +}
>> +
>> +static int vfio_devices_dma_logging_stop(VFIOContainer *container)
>> +{
>> +    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
>> +                              sizeof(uint64_t))] = {};
>> +    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
>> +
>> +    feature->argsz = sizeof(buf);
>> +    feature->flags = VFIO_DEVICE_FEATURE_SET;
>> +    feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
>> +
>> +    return vfio_devices_dma_logging_set(container, feature);
>> +}
>> +
>> +static gboolean vfio_device_dma_logging_range_add(DMAMap *map, gpointer data)
>> +{
>> +    struct vfio_device_feature_dma_logging_range **out = data;
>> +    struct vfio_device_feature_dma_logging_range *range = *out;
>> +
>> +    range->iova = map->iova;
>> +    /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
>> +    range->length = map->size + 1;
>> +
>> +    *out = ++range;
>> +
>> +    return false;
>> +}
>> +
>> +static gboolean vfio_iova_tree_get_first(DMAMap *map, gpointer data)
>> +{
>> +    DMAMap *first = data;
>> +
>> +    first->iova = map->iova;
>> +    first->size = map->size;
>> +
>> +    return true;
>> +}
>> +
>> +static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data)
>> +{
>> +    DMAMap *last = data;
>> +
>> +    last->iova = map->iova;
>> +    last->size = map->size;
>> +
>> +    return false;
>> +}
>> +
>> +static struct vfio_device_feature *
>> +vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
>> +{
>> +    struct vfio_device_feature *feature;
>> +    size_t feature_size;
>> +    struct vfio_device_feature_dma_logging_control *control;
>> +    struct vfio_device_feature_dma_logging_range *ranges;
>> +    unsigned int max_ranges;
>> +    unsigned int cur_ranges;
>> +
>> +    feature_size = sizeof(struct vfio_device_feature) +
>> +                   sizeof(struct vfio_device_feature_dma_logging_control);
>> +    feature = g_malloc0(feature_size);
>> +    feature->argsz = feature_size;
>> +    feature->flags = VFIO_DEVICE_FEATURE_SET;
>> +    feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
>> +
>> +    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
>> +    control->page_size = qemu_real_host_page_size();
>> +
>> +    QEMU_LOCK_GUARD(&container->mappings_mutex);
>> +
>> +    /*
>> +     * DMA logging uAPI guarantees to support at least num_ranges that fits into
>> +     * a single host kernel page. To be on the safe side, use this as a limit
>> +     * from which to merge to a single range.
>> +     */
>> +    max_ranges = qemu_real_host_page_size() / sizeof(*ranges);
>> +    cur_ranges = iova_tree_nnodes(container->mappings);
>> +    control->num_ranges = (cur_ranges <= max_ranges) ? cur_ranges : 1;
> This makes me suspicious that we're implementing to the characteristics
> of a specific device rather than strictly to the vfio migration API.
> Are we just trying to avoid the error handling to support the try and
> fall back to a single range behavior?  If we want to make a
> simplification, then document it as such.  The "[t]o be on the safe
> side" phrasing above could later be interpreted as avoiding an issue
> and might discourage a more complete implementation.

Yes, it was mainly to make things simple.
I will replace the "To be on the safe side..." phrasing.

Thanks.

>> +    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
>> +                        control->num_ranges);
>> +    if (!ranges) {
>> +        g_free(feature);
>> +        errno = ENOMEM;
>> +
>> +        return NULL;
>> +    }
>> +
>> +    control->ranges = (uint64_t)ranges;
>> +    if (cur_ranges <= max_ranges) {
>> +        iova_tree_foreach(container->mappings,
>> +                          vfio_device_dma_logging_range_add, &ranges);
>> +    } else {
>> +        DMAMap first, last;
>> +
>> +        iova_tree_foreach(container->mappings, vfio_iova_tree_get_first,
>> +                          &first);
>> +        iova_tree_foreach(container->mappings, vfio_iova_tree_get_last, &last);
>> +        ranges->iova = first.iova;
>> +        /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
>> +        ranges->length = (last.iova - first.iova) + last.size + 1;
>> +    }
>> +
>> +    return feature;
>> +}
>> +
>> +static void vfio_device_feature_dma_logging_start_destroy(
>> +    struct vfio_device_feature *feature)
>> +{
>> +    struct vfio_device_feature_dma_logging_control *control =
>> +        (struct vfio_device_feature_dma_logging_control *)feature->data;
>> +    struct vfio_device_feature_dma_logging_range *ranges =
>> +        (struct vfio_device_feature_dma_logging_range *)control->ranges;
>> +
>> +    g_free(ranges);
>> +    g_free(feature);
>> +}
>> +
>> +static int vfio_devices_dma_logging_start(VFIOContainer *container)
>> +{
>> +    struct vfio_device_feature *feature;
>> +    int ret;
>> +
>> +    feature = vfio_device_feature_dma_logging_start_create(container);
>> +    if (!feature) {
>> +        return -errno;
>> +    }
>> +
>> +    ret = vfio_devices_dma_logging_set(container, feature);
>> +    if (ret) {
>> +        vfio_devices_dma_logging_stop(container);
>> +    }
>> +
>> +    vfio_device_feature_dma_logging_start_destroy(feature);
>> +
>> +    return ret;
>> +}
>> +
>>   static void vfio_listener_log_global_start(MemoryListener *listener)
>>   {
>>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>       int ret;
>>
>> -    ret = vfio_set_dirty_page_tracking(container, true);
>> +    if (vfio_devices_all_device_dirty_tracking(container)) {
>> +        if (vfio_have_giommu(container)) {
>> +            /* Device dirty page tracking currently doesn't support vIOMMU */
>> +            return;
>> +        }
>> +
>> +        ret = vfio_devices_dma_logging_start(container);
>> +    } else {
>> +        ret = vfio_set_dirty_page_tracking(container, true);
>> +    }
>> +
>>       if (ret) {
>> +        error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
>> +                     ret, strerror(-ret));
>>           vfio_set_migration_error(ret);
>>       }
>>   }
>> @@ -1420,8 +1615,20 @@ static void vfio_listener_log_global_stop(MemoryListener *listener)
>>       VFIOContainer *container = container_of(listener, VFIOContainer, listener);
>>       int ret;
>>
>> -    ret = vfio_set_dirty_page_tracking(container, false);
>> +    if (vfio_devices_all_device_dirty_tracking(container)) {
>> +        if (vfio_have_giommu(container)) {
>> +            /* Device dirty page tracking currently doesn't support vIOMMU */
>> +            return;
>> +        }
>> +
>> +        ret = vfio_devices_dma_logging_stop(container);
>> +    } else {
>> +        ret = vfio_set_dirty_page_tracking(container, false);
>> +    }
>> +
>>       if (ret) {
>> +        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
>> +                     ret, strerror(-ret));
>>           vfio_set_migration_error(ret);
>>       }
>>   }
Alex Williamson Feb. 23, 2023, 7:27 p.m. UTC | #4
On Wed, 22 Feb 2023 22:02:24 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Wed, Feb 22, 2023 at 03:40:43PM -0700, Alex Williamson wrote:
> > > +    /*
> > > +     * DMA logging uAPI guarantees to support at least num_ranges that fits into
> > > +     * a single host kernel page. To be on the safe side, use this as a limit
> > > +     * from which to merge to a single range.
> > > +     */
> > > +    max_ranges = qemu_real_host_page_size() / sizeof(*ranges);
> > > +    cur_ranges = iova_tree_nnodes(container->mappings);
> > > +    control->num_ranges = (cur_ranges <= max_ranges) ? cur_ranges : 1;  
> > 
> > This makes me suspicious that we're implementing to the characteristics
> > of a specific device rather than strictly to the vfio migration API.
> > Are we just trying to avoid the error handling to support the try and
> > fall back to a single range behavior?  
> 
> This was what we agreed to when making the kernel patches. Userspace
> is restricted to sending one page of range list to the kernel, and the
> kernel will always adjust that to whatever smaller list the device needs.
> 
> We added this limit only because we don't want to have a way for
> userspace to consume a lot of kernel memory.
> 
> See LOG_MAX_RANGES in vfio_main.c
> 
> If qemu is in viommu mode and it has a huge number of ranges, then it must
> cut the list down before passing it to the kernel.

Ok, that's the kernel implementation, but the uAPI states:

 * The core kernel code guarantees to support by minimum num_ranges that fit
 * into a single kernel page. User space can try higher values but should give
 * up if the above can't be achieved as of some driver limitations.

So again, I think I'm just looking for a better comment that doesn't
add FUD to the reasoning behind switching to a single range, i.e. a)
it's easier to deal with given the kernel guarantee and b) the current
kernel implementation imposes a hard limit at page size anyway.  Thanks,

Alex
Jason Gunthorpe Feb. 23, 2023, 7:30 p.m. UTC | #5
On Thu, Feb 23, 2023 at 12:27:23PM -0700, Alex Williamson wrote:
> So again, I think I'm just looking for a better comment that doesn't
> add FUD to the reasoning behind switching to a single range, 

It isn't a single range, it is a single page of ranges, right?

The comment should say

"Keep the implementation simple and use at most a PAGE_SIZE of ranges
because the kernel is guaranteed to be able to parse that"

Jason
Alex Williamson Feb. 23, 2023, 8:16 p.m. UTC | #6
On Thu, 23 Feb 2023 15:30:28 -0400
Jason Gunthorpe <jgg@nvidia.com> wrote:

> On Thu, Feb 23, 2023 at 12:27:23PM -0700, Alex Williamson wrote:
> > So again, I think I'm just looking for a better comment that doesn't
> > add FUD to the reasoning behind switching to a single range,   
> 
> It isn't a single range, it is a single page of ranges, right?

Exceeding a single page of ranges is the inflection point at which we
switch to a single range.
 
> The comment should say
> 
> "Keep the implementation simple and use at most a PAGE_SIZE of ranges
> because the kernel is guaranteed to be able to parse that"

Something along those lines, yeah.  And bonus points for noting that
the kernel implementation is currently hard coded at this limit, so
there's no point in trying larger arrays as implied in the uAPI.
Thanks,

Alex
Jason Gunthorpe Feb. 23, 2023, 8:54 p.m. UTC | #7
On Thu, Feb 23, 2023 at 01:16:40PM -0700, Alex Williamson wrote:
> On Thu, 23 Feb 2023 15:30:28 -0400
> Jason Gunthorpe <jgg@nvidia.com> wrote:
> 
> > On Thu, Feb 23, 2023 at 12:27:23PM -0700, Alex Williamson wrote:
> > > So again, I think I'm just looking for a better comment that doesn't
> > > add FUD to the reasoning behind switching to a single range,   
> > 
> > It isn't a single range, it is a single page of ranges, right?
> 
> Exceeding a single page of ranges is the inflection point at which we
> switch to a single range.

Oh, that isn't what it should do - it should cut it back to fit in a
page.

Jason
Avihai Horon Feb. 26, 2023, 4:54 p.m. UTC | #8
On 23/02/2023 22:54, Jason Gunthorpe wrote:
> On Thu, Feb 23, 2023 at 01:16:40PM -0700, Alex Williamson wrote:
>> On Thu, 23 Feb 2023 15:30:28 -0400
>> Jason Gunthorpe <jgg@nvidia.com> wrote:
>>
>>> On Thu, Feb 23, 2023 at 12:27:23PM -0700, Alex Williamson wrote:
>>>> So again, I think I'm just looking for a better comment that doesn't
>>>> add FUD to the reasoning behind switching to a single range,
>>> It isn't a single range, it is a single page of ranges, right?
>> Exceeding a single page of ranges is the inflection point at which we
>> switch to a single range.
> Oh, that isn't what it should do - it should cut it back to fit in a
> page.

Sure, I will change it accordingly (and rephrase the comment).

Thanks.
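
Purely as an illustration of "cut it back to fit in a page" (this is a sketch, not the eventual follow-up patch, and it reuses the DMAMap/IOVATree helpers the patch already uses): emit at most max_ranges entries, letting the first max_ranges mappings keep their own entry and merging any remaining mappings into the last one. VFIORangeCap is a hypothetical helper type introduced only for this sketch, and it assumes iova_tree_foreach() visits mappings in ascending IOVA order.

typedef struct VFIORangeCap {
    struct vfio_device_feature_dma_logging_range *ranges;
    unsigned int max_ranges;
    unsigned int nr;
} VFIORangeCap;

static gboolean vfio_device_dma_logging_range_cap(DMAMap *map, gpointer data)
{
    VFIORangeCap *cap = data;
    struct vfio_device_feature_dma_logging_range *range;

    if (cap->nr < cap->max_ranges) {
        /* Still room: this mapping gets its own entry */
        range = &cap->ranges[cap->nr++];
        range->iova = map->iova;
        /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
        range->length = map->size + 1;
    } else {
        /* No room left: stretch the last entry to also cover this mapping */
        range = &cap->ranges[cap->max_ranges - 1];
        range->length = map->iova + map->size + 1 - range->iova;
    }

    return false;
}

With something like this, control->num_ranges becomes MIN(cur_ranges, max_ranges) and the single-range fallback goes away, at the cost of the stretched tail entry tracking some IOVA space that is not actually mapped.
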
diff mbox series

Patch

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 6f36876ce0..1f21e1fa43 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -149,6 +149,8 @@  typedef struct VFIODevice {
     VFIOMigration *migration;
     Error *migration_blocker;
     OnOffAuto pre_copy_dirty_page_tracking;
+    bool dirty_pages_supported;
+    bool dirty_tracking;
 } VFIODevice;
 
 struct VFIODeviceOps {
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6041da6c7e..740153e7d7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -473,6 +473,22 @@  static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
     return true;
 }
 
+static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (!vbasedev->dirty_pages_supported) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
 /*
  * Check if all VFIO devices are running and migration is active, which is
  * essentially equivalent to the migration being in pre-copy phase.
@@ -1404,13 +1420,192 @@  static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
     return ret;
 }
 
+static int vfio_devices_dma_logging_set(VFIOContainer *container,
+                                        struct vfio_device_feature *feature)
+{
+    bool status = (feature->flags & VFIO_DEVICE_FEATURE_MASK) ==
+                  VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
+    VFIODevice *vbasedev;
+    VFIOGroup *group;
+    int ret = 0;
+
+    QLIST_FOREACH(group, &container->group_list, container_next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if (vbasedev->dirty_tracking == status) {
+                continue;
+            }
+
+            ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
+            if (ret) {
+                ret = -errno;
+                error_report("%s: Failed to set DMA logging %s, err %d (%s)",
+                             vbasedev->name, status ? "start" : "stop", ret,
+                             strerror(errno));
+                goto out;
+            }
+            vbasedev->dirty_tracking = status;
+        }
+    }
+
+out:
+    return ret;
+}
+
+static int vfio_devices_dma_logging_stop(VFIOContainer *container)
+{
+    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
+                              sizeof(uint64_t))] = {};
+    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
+
+    feature->argsz = sizeof(buf);
+    feature->flags = VFIO_DEVICE_FEATURE_SET;
+    feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
+
+    return vfio_devices_dma_logging_set(container, feature);
+}
+
+static gboolean vfio_device_dma_logging_range_add(DMAMap *map, gpointer data)
+{
+    struct vfio_device_feature_dma_logging_range **out = data;
+    struct vfio_device_feature_dma_logging_range *range = *out;
+
+    range->iova = map->iova;
+    /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
+    range->length = map->size + 1;
+
+    *out = ++range;
+
+    return false;
+}
+
+static gboolean vfio_iova_tree_get_first(DMAMap *map, gpointer data)
+{
+    DMAMap *first = data;
+
+    first->iova = map->iova;
+    first->size = map->size;
+
+    return true;
+}
+
+static gboolean vfio_iova_tree_get_last(DMAMap *map, gpointer data)
+{
+    DMAMap *last = data;
+
+    last->iova = map->iova;
+    last->size = map->size;
+
+    return false;
+}
+
+static struct vfio_device_feature *
+vfio_device_feature_dma_logging_start_create(VFIOContainer *container)
+{
+    struct vfio_device_feature *feature;
+    size_t feature_size;
+    struct vfio_device_feature_dma_logging_control *control;
+    struct vfio_device_feature_dma_logging_range *ranges;
+    unsigned int max_ranges;
+    unsigned int cur_ranges;
+
+    feature_size = sizeof(struct vfio_device_feature) +
+                   sizeof(struct vfio_device_feature_dma_logging_control);
+    feature = g_malloc0(feature_size);
+    feature->argsz = feature_size;
+    feature->flags = VFIO_DEVICE_FEATURE_SET;
+    feature->flags |= VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
+
+    control = (struct vfio_device_feature_dma_logging_control *)feature->data;
+    control->page_size = qemu_real_host_page_size();
+
+    QEMU_LOCK_GUARD(&container->mappings_mutex);
+
+    /*
+     * DMA logging uAPI guarantees to support at least num_ranges that fits into
+     * a single host kernel page. To be on the safe side, use this as a limit
+     * from which to merge to a single range.
+     */
+    max_ranges = qemu_real_host_page_size() / sizeof(*ranges);
+    cur_ranges = iova_tree_nnodes(container->mappings);
+    control->num_ranges = (cur_ranges <= max_ranges) ? cur_ranges : 1;
+    ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
+                        control->num_ranges);
+    if (!ranges) {
+        g_free(feature);
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    control->ranges = (uint64_t)ranges;
+    if (cur_ranges <= max_ranges) {
+        iova_tree_foreach(container->mappings,
+                          vfio_device_dma_logging_range_add, &ranges);
+    } else {
+        DMAMap first, last;
+
+        iova_tree_foreach(container->mappings, vfio_iova_tree_get_first,
+                          &first);
+        iova_tree_foreach(container->mappings, vfio_iova_tree_get_last, &last);
+        ranges->iova = first.iova;
+        /* IOVATree is inclusive, DMA logging uAPI isn't, so add 1 to length */
+        ranges->length = (last.iova - first.iova) + last.size + 1;
+    }
+
+    return feature;
+}
+
+static void vfio_device_feature_dma_logging_start_destroy(
+    struct vfio_device_feature *feature)
+{
+    struct vfio_device_feature_dma_logging_control *control =
+        (struct vfio_device_feature_dma_logging_control *)feature->data;
+    struct vfio_device_feature_dma_logging_range *ranges =
+        (struct vfio_device_feature_dma_logging_range *)control->ranges;
+
+    g_free(ranges);
+    g_free(feature);
+}
+
+static int vfio_devices_dma_logging_start(VFIOContainer *container)
+{
+    struct vfio_device_feature *feature;
+    int ret;
+
+    feature = vfio_device_feature_dma_logging_start_create(container);
+    if (!feature) {
+        return -errno;
+    }
+
+    ret = vfio_devices_dma_logging_set(container, feature);
+    if (ret) {
+        vfio_devices_dma_logging_stop(container);
+    }
+
+    vfio_device_feature_dma_logging_start_destroy(feature);
+
+    return ret;
+}
+
 static void vfio_listener_log_global_start(MemoryListener *listener)
 {
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;
 
-    ret = vfio_set_dirty_page_tracking(container, true);
+    if (vfio_devices_all_device_dirty_tracking(container)) {
+        if (vfio_have_giommu(container)) {
+            /* Device dirty page tracking currently doesn't support vIOMMU */
+            return;
+        }
+
+        ret = vfio_devices_dma_logging_start(container);
+    } else {
+        ret = vfio_set_dirty_page_tracking(container, true);
+    }
+
     if (ret) {
+        error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
+                     ret, strerror(-ret));
         vfio_set_migration_error(ret);
     }
 }
@@ -1420,8 +1615,20 @@  static void vfio_listener_log_global_stop(MemoryListener *listener)
     VFIOContainer *container = container_of(listener, VFIOContainer, listener);
     int ret;
 
-    ret = vfio_set_dirty_page_tracking(container, false);
+    if (vfio_devices_all_device_dirty_tracking(container)) {
+        if (vfio_have_giommu(container)) {
+            /* Device dirty page tracking currently doesn't support vIOMMU */
+            return;
+        }
+
+        ret = vfio_devices_dma_logging_stop(container);
+    } else {
+        ret = vfio_set_dirty_page_tracking(container, false);
+    }
+
     if (ret) {
+        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
+                     ret, strerror(-ret));
         vfio_set_migration_error(ret);
     }
 }