
[QEMU,v25,15/17] vfio: Add ioctl to get dirty pages bitmap during dma unmap.

Message ID 1592684486-18511-16-git-send-email-kwankhede@nvidia.com
State New
Series Add migration support for VFIO devices

Commit Message

Kirti Wankhede June 20, 2020, 8:21 p.m. UTC
With vIOMMU, an IO virtual address range can get unmapped while in the
pre-copy phase of migration. In that case, the unmap ioctl should return the
pages pinned in that range, and QEMU should find their corresponding guest
physical addresses and report those dirty.

Suggested-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Reviewed-by: Neo Jia <cjia@nvidia.com>
---
 hw/vfio/common.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 4 deletions(-)
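
For reference, the kernel UAPI that the new unmap path relies on looks
roughly like this (a sketch based on linux/vfio.h as of this series; the
dirty bitmap description is appended to the unmap argument through the
variable-length data[] member):

struct vfio_bitmap {
        __u64        pgsize;    /* page size for the bitmap, in bytes */
        __u64        size;      /* size of the bitmap itself, in bytes */
        __u64 __user *data;     /* one bit per page, user pointer */
};

struct vfio_iommu_type1_dma_unmap {
        __u32   argsz;
        __u32   flags;
#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
        __u64   iova;           /* IO virtual address of the range to unmap */
        __u64   size;           /* size of the range, in bytes */
        __u8    data[];         /* holds a struct vfio_bitmap when the flag is set */
};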

Comments

Cornelia Huck June 23, 2020, 8:25 a.m. UTC | #1
On Sun, 21 Jun 2020 01:51:24 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> With vIOMMU, an IO virtual address range can get unmapped while in the
> pre-copy phase of migration. In that case, the unmap ioctl should return the
> pages pinned in that range, and QEMU should find their corresponding guest
> physical addresses and report those dirty.
> 
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/common.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 81 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 0518cf228ed5..a06b8f2f66e2 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -311,11 +311,83 @@ static bool vfio_devices_are_stopped_and_saving(void)
>      return true;
>  }
>  
> +static bool vfio_devices_are_running_and_saving(void)

I previously asked:
(https://lore.kernel.org/qemu-devel/20200506123125.449dbf42.cohuck@redhat.com/)

"Maybe s/are/all/ to make it sure that the scope is *all* vfio devices
here?

Is there any global state for this which we could use to check this in
a simpler way?"

Any comment?

> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +
> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> +                (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> +                continue;
> +            } else {
> +                return false;
> +            }
> +        }
> +    }
> +    return true;
> +}
Alex Williamson June 24, 2020, 6:56 p.m. UTC | #2
On Sun, 21 Jun 2020 01:51:24 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> With vIOMMU, an IO virtual address range can get unmapped while in the
> pre-copy phase of migration. In that case, the unmap ioctl should return the
> pages pinned in that range, and QEMU should find their corresponding guest
> physical addresses and report those dirty.
> 
> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Reviewed-by: Neo Jia <cjia@nvidia.com>
> ---
>  hw/vfio/common.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 81 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 0518cf228ed5..a06b8f2f66e2 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -311,11 +311,83 @@ static bool vfio_devices_are_stopped_and_saving(void)
>      return true;
>  }
>  
> +static bool vfio_devices_are_running_and_saving(void)
> +{
> +    VFIOGroup *group;
> +    VFIODevice *vbasedev;
> +
> +    QLIST_FOREACH(group, &vfio_group_list, next) {

Same as previous, I'm curious if we should instead be looking at
container granularity.  It especially seems to make sense here where
we're unmapping from a container, so iterating every device in every
group seems excessive.

> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> +                (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> +                continue;
> +            } else {
> +                return false;
> +            }

I'm also not sure about the polarity of this function: should it be that if
any device is _SAVING we should report the dirty bitmap?  For example,
what if we have a set of paired failover NICs where we intend to unplug
one just prior to stopping the devices, aren't we going to lose dirtied
pages with this logic that they must all be running and saving?  Thanks,

Alex

> +        }
> +    }
> +    return true;
> +}
> +
> +static int vfio_dma_unmap_bitmap(VFIOContainer *container,
> +                                 hwaddr iova, ram_addr_t size,
> +                                 IOMMUTLBEntry *iotlb)
> +{
> +    struct vfio_iommu_type1_dma_unmap *unmap;
> +    struct vfio_bitmap *bitmap;
> +    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
> +    int ret;
> +
> +    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
> +
> +    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
> +    unmap->iova = iova;
> +    unmap->size = size;
> +    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
> +    bitmap = (struct vfio_bitmap *)&unmap->data;
> +
> +    /*
> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
> +     * TARGET_PAGE_SIZE.
> +     */
> +
> +    bitmap->pgsize = TARGET_PAGE_SIZE;
> +    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> +                   BITS_PER_BYTE;
> +
> +    if (bitmap->size > container->max_dirty_bitmap_size) {
> +        error_report("UNMAP: Size of bitmap too big 0x%llx", bitmap->size);
> +        ret = -E2BIG;
> +        goto unmap_exit;
> +    }
> +
> +    bitmap->data = g_try_malloc0(bitmap->size);
> +    if (!bitmap->data) {
> +        ret = -ENOMEM;
> +        goto unmap_exit;
> +    }
> +
> +    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
> +    if (!ret) {
> +        cpu_physical_memory_set_dirty_lebitmap((uint64_t *)bitmap->data,
> +                iotlb->translated_addr, pages);
> +    } else {
> +        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
> +    }
> +
> +    g_free(bitmap->data);
> +unmap_exit:
> +    g_free(unmap);
> +    return ret;
> +}
> +
>  /*
>   * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>   */
>  static int vfio_dma_unmap(VFIOContainer *container,
> -                          hwaddr iova, ram_addr_t size)
> +                          hwaddr iova, ram_addr_t size,
> +                          IOMMUTLBEntry *iotlb)
>  {
>      struct vfio_iommu_type1_dma_unmap unmap = {
>          .argsz = sizeof(unmap),
> @@ -324,6 +396,11 @@ static int vfio_dma_unmap(VFIOContainer *container,
>          .size = size,
>      };
>  
> +    if (iotlb && container->dirty_pages_supported &&
> +        vfio_devices_are_running_and_saving()) {
> +        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
> +    }
> +
>      while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
>          /*
>           * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
> @@ -371,7 +448,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>       * the VGA ROM space.
>       */
>      if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> +        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
>           ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
>          return 0;
>      }
> @@ -542,7 +619,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>              }
>          }
>  
> -        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
> +        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
>          if (ret) {
>              error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>                           "0x%"HWADDR_PRIx") = %d (%m)",
> @@ -853,7 +930,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
>      }
>  
>      if (try_unmap) {
> -        ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
> +        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
>          if (ret) {
>              error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>                           "0x%"HWADDR_PRIx") = %d (%m)",
Kirti Wankhede June 25, 2020, 3:01 p.m. UTC | #3
On 6/25/2020 12:26 AM, Alex Williamson wrote:
> On Sun, 21 Jun 2020 01:51:24 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> With vIOMMU, an IO virtual address range can get unmapped while in the
>> pre-copy phase of migration. In that case, the unmap ioctl should return the
>> pages pinned in that range, and QEMU should find their corresponding guest
>> physical addresses and report those dirty.
>>
>> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
>> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
>> Reviewed-by: Neo Jia <cjia@nvidia.com>
>> ---
>>   hw/vfio/common.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 81 insertions(+), 4 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index 0518cf228ed5..a06b8f2f66e2 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -311,11 +311,83 @@ static bool vfio_devices_are_stopped_and_saving(void)
>>       return true;
>>   }
>>   
>> +static bool vfio_devices_are_running_and_saving(void)
>> +{
>> +    VFIOGroup *group;
>> +    VFIODevice *vbasedev;
>> +
>> +    QLIST_FOREACH(group, &vfio_group_list, next) {
> 
> Same as previous, I'm curious if we should instead be looking at
> container granularity.  It especially seems to make sense here where
> we're unmapping from a container, so iterating every device in every
> group seems excessive.
> 

Changing it to take a container argument.
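
A rough sketch of that container-scoped variant (the container->group_list /
container_next linkage is assumed from existing hw/vfio/common.c code, and
the function name is only illustrative; the actual respin may differ):

static bool vfio_devices_all_running_and_saving(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    /* Walk only the groups attached to this container. */
    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (!(vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) ||
                !(vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
                return false;
            }
        }
    }
    return true;
}

vfio_dma_unmap() would then call vfio_devices_all_running_and_saving(container)
instead of the global check.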

>> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
>> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
>> +                (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
>> +                continue;
>> +            } else {
>> +                return false;
>> +            }
> 
> I'm also not sure about the polarity of this function: should it be that if
> any device is _SAVING we should report the dirty bitmap?  For example,
> what if we have a set of paired failover NICs where we intend to unplug
> one just prior to stopping the devices, aren't we going to lose dirtied
> pages with this logic that they must all be running and saving?  Thanks,
> 

If migration is initiated, is device unplug allowed? Ideally it shouldn't
be. If it is, then how does QEMU handle the data stream of a device which
doesn't exist at the destination?

The _SAVING flag is set during the pre-copy and stop-and-copy phases. Here we
only want to track pages which are unmapped during the pre-copy phase, i.e.
when vCPUs are running. In the case of VM suspend/savevm, there is no
pre-copy phase, but ideally we shouldn't see unmaps while vCPUs are
stopped, right? Still, to be on the safer side, since we know the exact
phase, I would prefer to check for both the _SAVING and _RUNNING flags.
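
(For context, the device_state combinations this series uses for each
migration phase are roughly:

    pre-copy, vCPUs running:        _RUNNING | _SAVING
    stop-and-copy, vCPUs stopped:   _SAVING
    resuming at the destination:    _RESUMING

which is why the check is scoped to the _RUNNING | _SAVING combination.)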

Thanks,
Kirti


> Alex
> 
>> +        }
>> +    }
>> +    return true;
>> +}
>> +
>> +static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>> +                                 hwaddr iova, ram_addr_t size,
>> +                                 IOMMUTLBEntry *iotlb)
>> +{
>> +    struct vfio_iommu_type1_dma_unmap *unmap;
>> +    struct vfio_bitmap *bitmap;
>> +    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
>> +    int ret;
>> +
>> +    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
>> +
>> +    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
>> +    unmap->iova = iova;
>> +    unmap->size = size;
>> +    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
>> +    bitmap = (struct vfio_bitmap *)&unmap->data;
>> +
>> +    /*
>> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
>> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
>> +     * TARGET_PAGE_SIZE.
>> +     */
>> +
>> +    bitmap->pgsize = TARGET_PAGE_SIZE;
>> +    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
>> +                   BITS_PER_BYTE;
>> +
>> +    if (bitmap->size > container->max_dirty_bitmap_size) {
>> +        error_report("UNMAP: Size of bitmap too big 0x%llx", bitmap->size);
>> +        ret = -E2BIG;
>> +        goto unmap_exit;
>> +    }
>> +
>> +    bitmap->data = g_try_malloc0(bitmap->size);
>> +    if (!bitmap->data) {
>> +        ret = -ENOMEM;
>> +        goto unmap_exit;
>> +    }
>> +
>> +    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
>> +    if (!ret) {
>> +        cpu_physical_memory_set_dirty_lebitmap((uint64_t *)bitmap->data,
>> +                iotlb->translated_addr, pages);
>> +    } else {
>> +        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
>> +    }
>> +
>> +    g_free(bitmap->data);
>> +unmap_exit:
>> +    g_free(unmap);
>> +    return ret;
>> +}
>> +
>>   /*
>>    * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
>>    */
>>   static int vfio_dma_unmap(VFIOContainer *container,
>> -                          hwaddr iova, ram_addr_t size)
>> +                          hwaddr iova, ram_addr_t size,
>> +                          IOMMUTLBEntry *iotlb)
>>   {
>>       struct vfio_iommu_type1_dma_unmap unmap = {
>>           .argsz = sizeof(unmap),
>> @@ -324,6 +396,11 @@ static int vfio_dma_unmap(VFIOContainer *container,
>>           .size = size,
>>       };
>>   
>> +    if (iotlb && container->dirty_pages_supported &&
>> +        vfio_devices_are_running_and_saving()) {
>> +        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
>> +    }
>> +
>>       while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
>>           /*
>>            * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
>> @@ -371,7 +448,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
>>        * the VGA ROM space.
>>        */
>>       if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
>> -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
>> +        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
>>            ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
>>           return 0;
>>       }
>> @@ -542,7 +619,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>>               }
>>           }
>>   
>> -        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
>> +        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
>>           if (ret) {
>>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>>                            "0x%"HWADDR_PRIx") = %d (%m)",
>> @@ -853,7 +930,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
>>       }
>>   
>>       if (try_unmap) {
>> -        ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
>> +        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
>>           if (ret) {
>>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
>>                            "0x%"HWADDR_PRIx") = %d (%m)",
>
Alex Williamson June 25, 2020, 7:18 p.m. UTC | #4
On Thu, 25 Jun 2020 20:31:12 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 6/25/2020 12:26 AM, Alex Williamson wrote:
> > On Sun, 21 Jun 2020 01:51:24 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> With vIOMMU, an IO virtual address range can get unmapped while in the
> >> pre-copy phase of migration. In that case, the unmap ioctl should return the
> >> pages pinned in that range, and QEMU should find their corresponding guest
> >> physical addresses and report those dirty.
> >>
> >> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> >> ---
> >>   hw/vfio/common.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >>   1 file changed, 81 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> >> index 0518cf228ed5..a06b8f2f66e2 100644
> >> --- a/hw/vfio/common.c
> >> +++ b/hw/vfio/common.c
> >> @@ -311,11 +311,83 @@ static bool vfio_devices_are_stopped_and_saving(void)
> >>       return true;
> >>   }
> >>   
> >> +static bool vfio_devices_are_running_and_saving(void)
> >> +{
> >> +    VFIOGroup *group;
> >> +    VFIODevice *vbasedev;
> >> +
> >> +    QLIST_FOREACH(group, &vfio_group_list, next) {  
> > 
> > Same as previous, I'm curious if we should instead be looking at
> > container granularity.  It especially seems to make sense here where
> > we're unmapping from a container, so iterating every device in every
> > group seems excessive.
> >   
> 
> Changing it to take a container argument.
> 
> >> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> >> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> >> +                (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> >> +                continue;
> >> +            } else {
> >> +                return false;
> >> +            }  
> > 
> > I'm also not sure about the polarity of this function: should it be that if
> > any device is _SAVING we should report the dirty bitmap?  For example,
> > what if we have a set of paired failover NICs where we intend to unplug
> > one just prior to stopping the devices, aren't we going to lose dirtied
> > pages with this logic that they must all be running and saving?  Thanks,
> >   
> 
> If migration is initiated, is device unplug allowed? Ideally it shouldn't
> be. If it is, then how does QEMU handle the data stream of a device which
> doesn't exist at the destination?

include/hw/qdev-core.h
struct DeviceState {
    ...
    bool allow_unplug_during_migration;

AIUI, the failover_pair_id device is likely to be a vfio-pci NIC,
otherwise they'd simply migrate the primary NIC, so there's a very good
chance that a user would configure a VM with a migratable mdev device
and a failover NIC so that they have high speed networking on either
end of the migration.
 
> The _SAVING flag is set during the pre-copy and stop-and-copy phases. Here we
> only want to track pages which are unmapped during the pre-copy phase, i.e.
> when vCPUs are running. In the case of VM suspend/savevm, there is no
> pre-copy phase, but ideally we shouldn't see unmaps while vCPUs are
> stopped, right? Still, to be on the safer side, since we know the exact
> phase, I would prefer to check for both the _SAVING and _RUNNING flags.

We can't have unmaps while vCPUs are stopped, but I think the failover
code allows us to be in the pre-copy phase while not all devices
support migration.  As coded here, it appears that dirty tracking of any
unmap during that phase is lost.  Thanks,

Alex
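
A sketch of the "any device saving" polarity suggested here (hypothetical,
not part of this patch): report the dirty bitmap as soon as any device in
the container has _SAVING set, so an unmap during pre-copy is not lost just
because one device (e.g. a failover primary) is not migratable.

static bool vfio_devices_any_saving(VFIOContainer *container)
{
    VFIOGroup *group;
    VFIODevice *vbasedev;

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) {
                return true;
            }
        }
    }
    return false;
}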


> >> +        }
> >> +    }
> >> +    return true;
> >> +}
> >> +
> >> +static int vfio_dma_unmap_bitmap(VFIOContainer *container,
> >> +                                 hwaddr iova, ram_addr_t size,
> >> +                                 IOMMUTLBEntry *iotlb)
> >> +{
> >> +    struct vfio_iommu_type1_dma_unmap *unmap;
> >> +    struct vfio_bitmap *bitmap;
> >> +    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
> >> +    int ret;
> >> +
> >> +    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
> >> +
> >> +    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
> >> +    unmap->iova = iova;
> >> +    unmap->size = size;
> >> +    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
> >> +    bitmap = (struct vfio_bitmap *)&unmap->data;
> >> +
> >> +    /*
> >> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
> >> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
> >> +     * TARGET_PAGE_SIZE.
> >> +     */
> >> +
> >> +    bitmap->pgsize = TARGET_PAGE_SIZE;
> >> +    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> >> +                   BITS_PER_BYTE;
> >> +
> >> +    if (bitmap->size > container->max_dirty_bitmap_size) {
> >> +        error_report("UNMAP: Size of bitmap too big 0x%llx", bitmap->size);
> >> +        ret = -E2BIG;
> >> +        goto unmap_exit;
> >> +    }
> >> +
> >> +    bitmap->data = g_try_malloc0(bitmap->size);
> >> +    if (!bitmap->data) {
> >> +        ret = -ENOMEM;
> >> +        goto unmap_exit;
> >> +    }
> >> +
> >> +    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
> >> +    if (!ret) {
> >> +        cpu_physical_memory_set_dirty_lebitmap((uint64_t *)bitmap->data,
> >> +                iotlb->translated_addr, pages);
> >> +    } else {
> >> +        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
> >> +    }
> >> +
> >> +    g_free(bitmap->data);
> >> +unmap_exit:
> >> +    g_free(unmap);
> >> +    return ret;
> >> +}
> >> +
> >>   /*
> >>    * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> >>    */
> >>   static int vfio_dma_unmap(VFIOContainer *container,
> >> -                          hwaddr iova, ram_addr_t size)
> >> +                          hwaddr iova, ram_addr_t size,
> >> +                          IOMMUTLBEntry *iotlb)
> >>   {
> >>       struct vfio_iommu_type1_dma_unmap unmap = {
> >>           .argsz = sizeof(unmap),
> >> @@ -324,6 +396,11 @@ static int vfio_dma_unmap(VFIOContainer *container,
> >>           .size = size,
> >>       };
> >>   
> >> +    if (iotlb && container->dirty_pages_supported &&
> >> +        vfio_devices_are_running_and_saving()) {
> >> +        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
> >> +    }
> >> +
> >>       while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> >>           /*
> >>            * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
> >> @@ -371,7 +448,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> >>        * the VGA ROM space.
> >>        */
> >>       if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> >> -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> >> +        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
> >>            ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
> >>           return 0;
> >>       }
> >> @@ -542,7 +619,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> >>               }
> >>           }
> >>   
> >> -        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
> >> +        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
> >>           if (ret) {
> >>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> >>                            "0x%"HWADDR_PRIx") = %d (%m)",
> >> @@ -853,7 +930,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
> >>       }
> >>   
> >>       if (try_unmap) {
> >> -        ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
> >> +        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
> >>           if (ret) {
> >>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> >>                            "0x%"HWADDR_PRIx") = %d (%m)",  
> >   
>
Dr. David Alan Gilbert June 26, 2020, 2:15 p.m. UTC | #5
* Alex Williamson (alex.williamson@redhat.com) wrote:
> On Thu, 25 Jun 2020 20:31:12 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > On 6/25/2020 12:26 AM, Alex Williamson wrote:
> > > On Sun, 21 Jun 2020 01:51:24 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > >   
> > >> With vIOMMU, an IO virtual address range can get unmapped while in the
> > >> pre-copy phase of migration. In that case, the unmap ioctl should return the
> > >> pages pinned in that range, and QEMU should find their corresponding guest
> > >> physical addresses and report those dirty.
> > >>
> > >> Suggested-by: Alex Williamson <alex.williamson@redhat.com>
> > >> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> > >> Reviewed-by: Neo Jia <cjia@nvidia.com>
> > >> ---
> > >>   hw/vfio/common.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++---
> > >>   1 file changed, 81 insertions(+), 4 deletions(-)
> > >>
> > >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> > >> index 0518cf228ed5..a06b8f2f66e2 100644
> > >> --- a/hw/vfio/common.c
> > >> +++ b/hw/vfio/common.c
> > >> @@ -311,11 +311,83 @@ static bool vfio_devices_are_stopped_and_saving(void)
> > >>       return true;
> > >>   }
> > >>   
> > >> +static bool vfio_devices_are_running_and_saving(void)
> > >> +{
> > >> +    VFIOGroup *group;
> > >> +    VFIODevice *vbasedev;
> > >> +
> > >> +    QLIST_FOREACH(group, &vfio_group_list, next) {  
> > > 
> > > Same as previous, I'm curious if we should instead be looking at
> > > container granularity.  It especially seems to make sense here where
> > > we're unmapping from a container, so iterating every device in every
> > > group seems excessive.
> > >   
> > 
> > Changing it to take a container argument.
> > 
> > >> +        QLIST_FOREACH(vbasedev, &group->device_list, next) {
> > >> +            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
> > >> +                (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
> > >> +                continue;
> > >> +            } else {
> > >> +                return false;
> > >> +            }  
> > > 
> > > I'm also not sure about the polarity of this function: should it be that if
> > > any device is _SAVING we should report the dirty bitmap?  For example,
> > > what if we have a set of paired failover NICs where we intend to unplug
> > > one just prior to stopping the devices, aren't we going to lose dirtied
> > > pages with this logic that they must all be running and saving?  Thanks,
> > >   
> > 
> > If migration is initiated, is device unplug allowed? Ideally it shouldn't
> > be. If it is, then how does QEMU handle the data stream of a device which
> > doesn't exist at the destination?
> 
> include/hw/qdev-core.h
> struct DeviceState {
>     ...
>     bool allow_unplug_during_migration;
> 
> AIUI, the failover_pair_id device is likely to be a vfio-pci NIC,
> otherwise they'd simply migrate the primary NIC, so there's a very good
> chance that a user would configure a VM with a migratable mdev device
> and a failover NIC so that they have high speed networking on either
> end of the migration.

My understanding of that failover code is that it happens right at the
beginning of migration, while we're still in MIGRATION_STATUS_SETUP;
whether there's anything that enforces that is a different matter.
But, in that case, I don't think you'd be interested in that dirtying.

Dave

> > The _SAVING flag is set during the pre-copy and stop-and-copy phases. Here we
> > only want to track pages which are unmapped during the pre-copy phase, i.e.
> > when vCPUs are running. In the case of VM suspend/savevm, there is no
> > pre-copy phase, but ideally we shouldn't see unmaps while vCPUs are
> > stopped, right? Still, to be on the safer side, since we know the exact
> > phase, I would prefer to check for both the _SAVING and _RUNNING flags.
> 
> We can't have unmaps while vCPUs are stopped, but I think the failover
> code allows us to be in the pre-copy phase while not all devices
> support migration.  As coded here, it appears that dirty tracking of any
> unmap during that phase is lost.  Thanks,
> 
> Alex
> 
> 
> > >> +        }
> > >> +    }
> > >> +    return true;
> > >> +}
> > >> +
> > >> +static int vfio_dma_unmap_bitmap(VFIOContainer *container,
> > >> +                                 hwaddr iova, ram_addr_t size,
> > >> +                                 IOMMUTLBEntry *iotlb)
> > >> +{
> > >> +    struct vfio_iommu_type1_dma_unmap *unmap;
> > >> +    struct vfio_bitmap *bitmap;
> > >> +    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
> > >> +    int ret;
> > >> +
> > >> +    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
> > >> +
> > >> +    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
> > >> +    unmap->iova = iova;
> > >> +    unmap->size = size;
> > >> +    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
> > >> +    bitmap = (struct vfio_bitmap *)&unmap->data;
> > >> +
> > >> +    /*
> > >> +     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
> > >> +     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
> > >> +     * TARGET_PAGE_SIZE.
> > >> +     */
> > >> +
> > >> +    bitmap->pgsize = TARGET_PAGE_SIZE;
> > >> +    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> > >> +                   BITS_PER_BYTE;
> > >> +
> > >> +    if (bitmap->size > container->max_dirty_bitmap_size) {
> > >> +        error_report("UNMAP: Size of bitmap too big 0x%llx", bitmap->size);
> > >> +        ret = -E2BIG;
> > >> +        goto unmap_exit;
> > >> +    }
> > >> +
> > >> +    bitmap->data = g_try_malloc0(bitmap->size);
> > >> +    if (!bitmap->data) {
> > >> +        ret = -ENOMEM;
> > >> +        goto unmap_exit;
> > >> +    }
> > >> +
> > >> +    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
> > >> +    if (!ret) {
> > >> +        cpu_physical_memory_set_dirty_lebitmap((uint64_t *)bitmap->data,
> > >> +                iotlb->translated_addr, pages);
> > >> +    } else {
> > >> +        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
> > >> +    }
> > >> +
> > >> +    g_free(bitmap->data);
> > >> +unmap_exit:
> > >> +    g_free(unmap);
> > >> +    return ret;
> > >> +}
> > >> +
> > >>   /*
> > >>    * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
> > >>    */
> > >>   static int vfio_dma_unmap(VFIOContainer *container,
> > >> -                          hwaddr iova, ram_addr_t size)
> > >> +                          hwaddr iova, ram_addr_t size,
> > >> +                          IOMMUTLBEntry *iotlb)
> > >>   {
> > >>       struct vfio_iommu_type1_dma_unmap unmap = {
> > >>           .argsz = sizeof(unmap),
> > >> @@ -324,6 +396,11 @@ static int vfio_dma_unmap(VFIOContainer *container,
> > >>           .size = size,
> > >>       };
> > >>   
> > >> +    if (iotlb && container->dirty_pages_supported &&
> > >> +        vfio_devices_are_running_and_saving()) {
> > >> +        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
> > >> +    }
> > >> +
> > >>       while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
> > >>           /*
> > >>            * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
> > >> @@ -371,7 +448,7 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
> > >>        * the VGA ROM space.
> > >>        */
> > >>       if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
> > >> -        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
> > >> +        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
> > >>            ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
> > >>           return 0;
> > >>       }
> > >> @@ -542,7 +619,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
> > >>               }
> > >>           }
> > >>   
> > >> -        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
> > >> +        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
> > >>           if (ret) {
> > >>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> > >>                            "0x%"HWADDR_PRIx") = %d (%m)",
> > >> @@ -853,7 +930,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
> > >>       }
> > >>   
> > >>       if (try_unmap) {
> > >> -        ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
> > >> +        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
> > >>           if (ret) {
> > >>               error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
> > >>                            "0x%"HWADDR_PRIx") = %d (%m)",  
> > >   
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0518cf228ed5..a06b8f2f66e2 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -311,11 +311,83 @@  static bool vfio_devices_are_stopped_and_saving(void)
     return true;
 }
 
+static bool vfio_devices_are_running_and_saving(void)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev;
+
+    QLIST_FOREACH(group, &vfio_group_list, next) {
+        QLIST_FOREACH(vbasedev, &group->device_list, next) {
+            if ((vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) &&
+                (vbasedev->device_state & VFIO_DEVICE_STATE_RUNNING)) {
+                continue;
+            } else {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+static int vfio_dma_unmap_bitmap(VFIOContainer *container,
+                                 hwaddr iova, ram_addr_t size,
+                                 IOMMUTLBEntry *iotlb)
+{
+    struct vfio_iommu_type1_dma_unmap *unmap;
+    struct vfio_bitmap *bitmap;
+    uint64_t pages = TARGET_PAGE_ALIGN(size) >> TARGET_PAGE_BITS;
+    int ret;
+
+    unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
+
+    unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
+    unmap->iova = iova;
+    unmap->size = size;
+    unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
+    bitmap = (struct vfio_bitmap *)&unmap->data;
+
+    /*
+     * cpu_physical_memory_set_dirty_lebitmap() expects pages in bitmap of
+     * TARGET_PAGE_SIZE to mark those dirty. Hence set bitmap_pgsize to
+     * TARGET_PAGE_SIZE.
+     */
+
+    bitmap->pgsize = TARGET_PAGE_SIZE;
+    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
+                   BITS_PER_BYTE;
+
+    if (bitmap->size > container->max_dirty_bitmap_size) {
+        error_report("UNMAP: Size of bitmap too big 0x%llx", bitmap->size);
+        ret = -E2BIG;
+        goto unmap_exit;
+    }
+
+    bitmap->data = g_try_malloc0(bitmap->size);
+    if (!bitmap->data) {
+        ret = -ENOMEM;
+        goto unmap_exit;
+    }
+
+    ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
+    if (!ret) {
+        cpu_physical_memory_set_dirty_lebitmap((uint64_t *)bitmap->data,
+                iotlb->translated_addr, pages);
+    } else {
+        error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
+    }
+
+    g_free(bitmap->data);
+unmap_exit:
+    g_free(unmap);
+    return ret;
+}
+
 /*
  * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
  */
 static int vfio_dma_unmap(VFIOContainer *container,
-                          hwaddr iova, ram_addr_t size)
+                          hwaddr iova, ram_addr_t size,
+                          IOMMUTLBEntry *iotlb)
 {
     struct vfio_iommu_type1_dma_unmap unmap = {
         .argsz = sizeof(unmap),
@@ -324,6 +396,11 @@  static int vfio_dma_unmap(VFIOContainer *container,
         .size = size,
     };
 
+    if (iotlb && container->dirty_pages_supported &&
+        vfio_devices_are_running_and_saving()) {
+        return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
+    }
+
     while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
         /*
          * The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
@@ -371,7 +448,7 @@  static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
      * the VGA ROM space.
      */
     if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
-        (errno == EBUSY && vfio_dma_unmap(container, iova, size) == 0 &&
+        (errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
          ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
         return 0;
     }
@@ -542,7 +619,7 @@  static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
             }
         }
 
-        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1);
+        ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                          "0x%"HWADDR_PRIx") = %d (%m)",
@@ -853,7 +930,7 @@  static void vfio_listener_region_del(MemoryListener *listener,
     }
 
     if (try_unmap) {
-        ret = vfio_dma_unmap(container, iova, int128_get64(llsize));
+        ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
         if (ret) {
             error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                          "0x%"HWADDR_PRIx") = %d (%m)",