diff mbox series

[v2,07/20] vfio/common: Add VFIOBitmap and (de)alloc functions

Message ID 20230222174915.5647-8-avihaih@nvidia.com
State New
Headers show
Series vfio: Add migration pre-copy support and device dirty tracking | expand

Commit Message

Avihai Horon Feb. 22, 2023, 5:49 p.m. UTC
There are already two places where dirty page bitmap allocation and
calculations are done in open code. With device dirty page tracking
being added in next patches, there are going to be even more places.

To avoid code duplication, introduce VFIOBitmap struct and corresponding
alloc and dealloc functions and use them where applicable.

Signed-off-by: Avihai Horon <avihaih@nvidia.com>
---
 hw/vfio/common.c | 89 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 60 insertions(+), 29 deletions(-)

Comments

Alex Williamson Feb. 22, 2023, 9:40 p.m. UTC | #1
On Wed, 22 Feb 2023 19:49:02 +0200
Avihai Horon <avihaih@nvidia.com> wrote:

> There are already two places where dirty page bitmap allocation and
> calculations are done in open code. With device dirty page tracking
> being added in next patches, there are going to be even more places.
> 
> To avoid code duplication, introduce VFIOBitmap struct and corresponding
> alloc and dealloc functions and use them where applicable.
> 
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>  hw/vfio/common.c | 89 ++++++++++++++++++++++++++++++++----------------
>  1 file changed, 60 insertions(+), 29 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ac93b85632..84f08bdbbb 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>   * Device state interfaces
>   */
>  
> +typedef struct {
> +    unsigned long *bitmap;
> +    hwaddr size;
> +    hwaddr pages;
> +} VFIOBitmap;
> +
> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
> +{
> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
> +    if (!vbmap) {
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
> +    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
> +                                         BITS_PER_BYTE;
> +    vbmap->bitmap = g_try_malloc0(vbmap->size);
> +    if (!vbmap->bitmap) {
> +        g_free(vbmap);
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    return vbmap;
> +}
> +
> +static void vfio_bitmap_dealloc(VFIOBitmap *vbmap)
> +{
> +    g_free(vbmap->bitmap);
> +    g_free(vbmap);
> +}

Nit, '_alloc' and '_free' seems like a more standard convention.
Thanks,

Alex
Avihai Horon Feb. 23, 2023, 3:27 p.m. UTC | #2
On 22/02/2023 23:40, Alex Williamson wrote:
> External email: Use caution opening links or attachments
>
>
> On Wed, 22 Feb 2023 19:49:02 +0200
> Avihai Horon <avihaih@nvidia.com> wrote:
>
>> There are already two places where dirty page bitmap allocation and
>> calculations are done in open code. With device dirty page tracking
>> being added in next patches, there are going to be even more places.
>>
>> To avoid code duplication, introduce VFIOBitmap struct and corresponding
>> alloc and dealloc functions and use them where applicable.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> ---
>>   hw/vfio/common.c | 89 ++++++++++++++++++++++++++++++++----------------
>>   1 file changed, 60 insertions(+), 29 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ac93b85632..84f08bdbbb 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>>    * Device state interfaces
>>    */
>>
>> +typedef struct {
>> +    unsigned long *bitmap;
>> +    hwaddr size;
>> +    hwaddr pages;
>> +} VFIOBitmap;
>> +
>> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
>> +{
>> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
>> +    if (!vbmap) {
>> +        errno = ENOMEM;
>> +
>> +        return NULL;
>> +    }
>> +
>> +    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
>> +    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
>> +                                         BITS_PER_BYTE;
>> +    vbmap->bitmap = g_try_malloc0(vbmap->size);
>> +    if (!vbmap->bitmap) {
>> +        g_free(vbmap);
>> +        errno = ENOMEM;
>> +
>> +        return NULL;
>> +    }
>> +
>> +    return vbmap;
>> +}
>> +
>> +static void vfio_bitmap_dealloc(VFIOBitmap *vbmap)
>> +{
>> +    g_free(vbmap->bitmap);
>> +    g_free(vbmap);
>> +}
> Nit, '_alloc' and '_free' seems like a more standard convention.

Sure, will change.

Thanks.
Cédric Le Goater Feb. 27, 2023, 2:09 p.m. UTC | #3
On 2/22/23 18:49, Avihai Horon wrote:
> There are already two places where dirty page bitmap allocation and
> calculations are done in open code. With device dirty page tracking
> being added in next patches, there are going to be even more places.
> 
> To avoid code duplication, introduce VFIOBitmap struct and corresponding
> alloc and dealloc functions and use them where applicable.
> 
> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
> ---
>   hw/vfio/common.c | 89 ++++++++++++++++++++++++++++++++----------------
>   1 file changed, 60 insertions(+), 29 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index ac93b85632..84f08bdbbb 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>    * Device state interfaces
>    */
>   
> +typedef struct {
> +    unsigned long *bitmap;
> +    hwaddr size;
> +    hwaddr pages;
> +} VFIOBitmap;
> +
> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
> +{
> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);

I think using g_malloc0() for the VFIOBitmap should be fine. If QEMU can
not allocate a couple of bytes, we are in trouble anyway.

Thanks,

C.


> +    if (!vbmap) {
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
> +    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
> +                                         BITS_PER_BYTE;
> +    vbmap->bitmap = g_try_malloc0(vbmap->size);
> +    if (!vbmap->bitmap) {
> +        g_free(vbmap);
> +        errno = ENOMEM;
> +
> +        return NULL;
> +    }
> +
> +    return vbmap;
> +}
> +
> +static void vfio_bitmap_dealloc(VFIOBitmap *vbmap)
> +{
> +    g_free(vbmap->bitmap);
> +    g_free(vbmap);
> +}
> +
>   bool vfio_mig_active(void)
>   {
>       VFIOGroup *group;
> @@ -470,9 +505,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>   {
>       struct vfio_iommu_type1_dma_unmap *unmap;
>       struct vfio_bitmap *bitmap;
> -    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
> +    VFIOBitmap *vbmap;
>       int ret;
>   
> +    vbmap = vfio_bitmap_alloc(size);
> +    if (!vbmap) {
> +        return -errno;
> +    }
> +
>       unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
>   
>       unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
> @@ -486,35 +526,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
>        * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
>        * to qemu_real_host_page_size.
>        */
> -
>       bitmap->pgsize = qemu_real_host_page_size();
> -    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> -                   BITS_PER_BYTE;
> +    bitmap->size = vbmap->size;
> +    bitmap->data = (__u64 *)vbmap->bitmap;
>   
> -    if (bitmap->size > container->max_dirty_bitmap_size) {
> -        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
> -                     (uint64_t)bitmap->size);
> +    if (vbmap->size > container->max_dirty_bitmap_size) {
> +        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap->size);
>           ret = -E2BIG;
>           goto unmap_exit;
>       }
>   
> -    bitmap->data = g_try_malloc0(bitmap->size);
> -    if (!bitmap->data) {
> -        ret = -ENOMEM;
> -        goto unmap_exit;
> -    }
> -
>       ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
>       if (!ret) {
> -        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
> -                iotlb->translated_addr, pages);
> +        cpu_physical_memory_set_dirty_lebitmap(vbmap->bitmap,
> +                iotlb->translated_addr, vbmap->pages);
>       } else {
>           error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
>       }
>   
> -    g_free(bitmap->data);
>   unmap_exit:
>       g_free(unmap);
> +    vfio_bitmap_dealloc(vbmap);
> +
>       return ret;
>   }
>   
> @@ -1331,7 +1364,7 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>   {
>       struct vfio_iommu_type1_dirty_bitmap *dbitmap;
>       struct vfio_iommu_type1_dirty_bitmap_get *range;
> -    uint64_t pages;
> +    VFIOBitmap *vbmap;
>       int ret;
>   
>       if (!container->dirty_pages_supported) {
> @@ -1341,6 +1374,11 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>           return 0;
>       }
>   
> +    vbmap = vfio_bitmap_alloc(size);
> +    if (!vbmap) {
> +        return -errno;
> +    }
> +
>       dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
>   
>       dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
> @@ -1355,15 +1393,8 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>        * to qemu_real_host_page_size.
>        */
>       range->bitmap.pgsize = qemu_real_host_page_size();
> -
> -    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
> -    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
> -                                         BITS_PER_BYTE;
> -    range->bitmap.data = g_try_malloc0(range->bitmap.size);
> -    if (!range->bitmap.data) {
> -        ret = -ENOMEM;
> -        goto err_out;
> -    }
> +    range->bitmap.size = vbmap->size;
> +    range->bitmap.data = (__u64 *)vbmap->bitmap;
>   
>       ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
>       if (ret) {
> @@ -1374,14 +1405,14 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>           goto err_out;
>       }
>   
> -    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
> -                                            ram_addr, pages);
> +    cpu_physical_memory_set_dirty_lebitmap(vbmap->bitmap, ram_addr,
> +                                           vbmap->pages);
>   
>       trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
>                                   range->bitmap.size, ram_addr);
>   err_out:
> -    g_free(range->bitmap.data);
>       g_free(dbitmap);
> +    vfio_bitmap_dealloc(vbmap);
>   
>       return ret;
>   }
Avihai Horon March 1, 2023, 6:56 p.m. UTC | #4
On 27/02/2023 16:09, Cédric Le Goater wrote:
> External email: Use caution opening links or attachments
>
>
> On 2/22/23 18:49, Avihai Horon wrote:
>> There are already two places where dirty page bitmap allocation and
>> calculations are done in open code. With device dirty page tracking
>> being added in next patches, there are going to be even more places.
>>
>> To avoid code duplication, introduce VFIOBitmap struct and corresponding
>> alloc and dealloc functions and use them where applicable.
>>
>> Signed-off-by: Avihai Horon <avihaih@nvidia.com>
>> ---
>>   hw/vfio/common.c | 89 ++++++++++++++++++++++++++++++++----------------
>>   1 file changed, 60 insertions(+), 29 deletions(-)
>>
>> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
>> index ac93b85632..84f08bdbbb 100644
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>>    * Device state interfaces
>>    */
>>
>> +typedef struct {
>> +    unsigned long *bitmap;
>> +    hwaddr size;
>> +    hwaddr pages;
>> +} VFIOBitmap;
>> +
>> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
>> +{
>> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
>
> I think using g_malloc0() for the VFIOBitmap should be fine. If QEMU can
> not allocate a couple of bytes, we are in trouble anyway.
>
Sure, this will simplify the code a bit. I will change it.

Thanks.

>
>
>> +    if (!vbmap) {
>> +        errno = ENOMEM;
>> +
>> +        return NULL;
>> +    }
>> +
>> +    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / 
>> qemu_real_host_page_size();
>> +    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * 
>> BITS_PER_BYTE) /
>> +                                         BITS_PER_BYTE;
>> +    vbmap->bitmap = g_try_malloc0(vbmap->size);
>> +    if (!vbmap->bitmap) {
>> +        g_free(vbmap);
>> +        errno = ENOMEM;
>> +
>> +        return NULL;
>> +    }
>> +
>> +    return vbmap;
>> +}
>> +
>> +static void vfio_bitmap_dealloc(VFIOBitmap *vbmap)
>> +{
>> +    g_free(vbmap->bitmap);
>> +    g_free(vbmap);
>> +}
>> +
>>   bool vfio_mig_active(void)
>>   {
>>       VFIOGroup *group;
>> @@ -470,9 +505,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
>> *container,
>>   {
>>       struct vfio_iommu_type1_dma_unmap *unmap;
>>       struct vfio_bitmap *bitmap;
>> -    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / 
>> qemu_real_host_page_size();
>> +    VFIOBitmap *vbmap;
>>       int ret;
>>
>> +    vbmap = vfio_bitmap_alloc(size);
>> +    if (!vbmap) {
>> +        return -errno;
>> +    }
>> +
>>       unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
>>
>>       unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
>> @@ -486,35 +526,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer 
>> *container,
>>        * qemu_real_host_page_size to mark those dirty. Hence set 
>> bitmap_pgsize
>>        * to qemu_real_host_page_size.
>>        */
>> -
>>       bitmap->pgsize = qemu_real_host_page_size();
>> -    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
>> -                   BITS_PER_BYTE;
>> +    bitmap->size = vbmap->size;
>> +    bitmap->data = (__u64 *)vbmap->bitmap;
>>
>> -    if (bitmap->size > container->max_dirty_bitmap_size) {
>> -        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
>> -                     (uint64_t)bitmap->size);
>> +    if (vbmap->size > container->max_dirty_bitmap_size) {
>> +        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, 
>> vbmap->size);
>>           ret = -E2BIG;
>>           goto unmap_exit;
>>       }
>>
>> -    bitmap->data = g_try_malloc0(bitmap->size);
>> -    if (!bitmap->data) {
>> -        ret = -ENOMEM;
>> -        goto unmap_exit;
>> -    }
>> -
>>       ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
>>       if (!ret) {
>> -        cpu_physical_memory_set_dirty_lebitmap((unsigned long 
>> *)bitmap->data,
>> -                iotlb->translated_addr, pages);
>> + cpu_physical_memory_set_dirty_lebitmap(vbmap->bitmap,
>> +                iotlb->translated_addr, vbmap->pages);
>>       } else {
>>           error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
>>       }
>>
>> -    g_free(bitmap->data);
>>   unmap_exit:
>>       g_free(unmap);
>> +    vfio_bitmap_dealloc(vbmap);
>> +
>>       return ret;
>>   }
>>
>> @@ -1331,7 +1364,7 @@ static int vfio_get_dirty_bitmap(VFIOContainer 
>> *container, uint64_t iova,
>>   {
>>       struct vfio_iommu_type1_dirty_bitmap *dbitmap;
>>       struct vfio_iommu_type1_dirty_bitmap_get *range;
>> -    uint64_t pages;
>> +    VFIOBitmap *vbmap;
>>       int ret;
>>
>>       if (!container->dirty_pages_supported) {
>> @@ -1341,6 +1374,11 @@ static int vfio_get_dirty_bitmap(VFIOContainer 
>> *container, uint64_t iova,
>>           return 0;
>>       }
>>
>> +    vbmap = vfio_bitmap_alloc(size);
>> +    if (!vbmap) {
>> +        return -errno;
>> +    }
>> +
>>       dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
>>
>>       dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
>> @@ -1355,15 +1393,8 @@ static int vfio_get_dirty_bitmap(VFIOContainer 
>> *container, uint64_t iova,
>>        * to qemu_real_host_page_size.
>>        */
>>       range->bitmap.pgsize = qemu_real_host_page_size();
>> -
>> -    pages = REAL_HOST_PAGE_ALIGN(range->size) / 
>> qemu_real_host_page_size();
>> -    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * 
>> BITS_PER_BYTE) /
>> -                                         BITS_PER_BYTE;
>> -    range->bitmap.data = g_try_malloc0(range->bitmap.size);
>> -    if (!range->bitmap.data) {
>> -        ret = -ENOMEM;
>> -        goto err_out;
>> -    }
>> +    range->bitmap.size = vbmap->size;
>> +    range->bitmap.data = (__u64 *)vbmap->bitmap;
>>
>>       ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
>>       if (ret) {
>> @@ -1374,14 +1405,14 @@ static int 
>> vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
>>           goto err_out;
>>       }
>>
>> -    cpu_physical_memory_set_dirty_lebitmap((unsigned long 
>> *)range->bitmap.data,
>> -                                            ram_addr, pages);
>> +    cpu_physical_memory_set_dirty_lebitmap(vbmap->bitmap, ram_addr,
>> +                                           vbmap->pages);
>>
>>       trace_vfio_get_dirty_bitmap(container->fd, range->iova, 
>> range->size,
>>                                   range->bitmap.size, ram_addr);
>>   err_out:
>> -    g_free(range->bitmap.data);
>>       g_free(dbitmap);
>> +    vfio_bitmap_dealloc(vbmap);
>>
>>       return ret;
>>   }
>
Joao Martins March 2, 2023, 1:24 p.m. UTC | #5
On 27/02/2023 14:09, Cédric Le Goater wrote:
> On 2/22/23 18:49, Avihai Horon wrote:
>> --- a/hw/vfio/common.c
>> +++ b/hw/vfio/common.c
>> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>>    * Device state interfaces
>>    */
>>   +typedef struct {
>> +    unsigned long *bitmap;
>> +    hwaddr size;
>> +    hwaddr pages;
>> +} VFIOBitmap;
>> +
>> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
>> +{
>> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
> 
> I think using g_malloc0() for the VFIOBitmap should be fine. If QEMU can
> not allocate a couple of bytes, we are in trouble anyway.
> 

OOM situations are rather unpredictable, and switching to g_malloc0 means we
will exit ungracefully in the middle of fetching dirty bitmaps. And this
function (vfio_bitmap_alloc) overall will be allocating megabytes for terabyte
guests.

It would be ok if we are initializing, but this is at runtime when we do
migration. I think we should stick with g_try_new0. exit on failure should be
reserved to failure to switch the kernel migration state whereby we are likely
to be dealing with a hardware failure and thus requires something more drastic.

	Joao
Cédric Le Goater March 2, 2023, 2:52 p.m. UTC | #6
Hello Joao,

On 3/2/23 14:24, Joao Martins wrote:
> On 27/02/2023 14:09, Cédric Le Goater wrote:
>> On 2/22/23 18:49, Avihai Horon wrote:
>>> --- a/hw/vfio/common.c
>>> +++ b/hw/vfio/common.c
>>> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>>>     * Device state interfaces
>>>     */
>>>    +typedef struct {
>>> +    unsigned long *bitmap;
>>> +    hwaddr size;
>>> +    hwaddr pages;
>>> +} VFIOBitmap;
>>> +
>>> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
>>> +{
>>> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
>>
>> I think using g_malloc0() for the VFIOBitmap should be fine. If QEMU can
>> not allocate a couple of bytes, we are in trouble anyway.
>>
> 
> OOM situations are rather unpredictable, and switching to g_malloc0 means we
> will exit ungracefully in the middle of fetching dirty bitmaps. And this
> function (vfio_bitmap_alloc) overall will be allocating megabytes for terabyte
> guests.
> 
> It would be ok if we are initializing, but this is at runtime when we do
> migration. I think we should stick with g_try_new0. exit on failure should be
> reserved to failure to switch the kernel migration state whereby we are likely
> to be dealing with a hardware failure and thus requires something more drastic.

I agree for large allocation :

     vbmap->bitmap = g_try_malloc0(vbmap->size);

but not for the smaller ones, like VFIOBitmap. You would have to
convert some other g_malloc0() calls, like the one allocating 'unmap'
in vfio_dma_unmap_bitmap(), to be consistent.

Given the size of VFIOBitmap, I think it could live on the stack in
routine vfio_dma_unmap_bitmap() and routine vfio_get_dirty_bitmap()
since the reference is not kept.

The 'vbmap' attribute of vfio_giommu_dirty_notifier does not need
to be a pointer either.

vfio_bitmap_alloc(hwaddr size) could then become
vfio_bitmap_init(VFIOBitmap *vbmap, hwaddr size).

Anyhow, this is minor. It would simplify a bit the exit path
and error handling.

Thanks,

C.
Joao Martins March 2, 2023, 4:30 p.m. UTC | #7
On 02/03/2023 14:52, Cédric Le Goater wrote:
> Hello Joao,
> On 3/2/23 14:24, Joao Martins wrote:
>> On 27/02/2023 14:09, Cédric Le Goater wrote:
>>> On 2/22/23 18:49, Avihai Horon wrote:
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>>>>     * Device state interfaces
>>>>     */
>>>>    +typedef struct {
>>>> +    unsigned long *bitmap;
>>>> +    hwaddr size;
>>>> +    hwaddr pages;
>>>> +} VFIOBitmap;
>>>> +
>>>> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
>>>> +{
>>>> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
>>>
>>> I think using g_malloc0() for the VFIOBitmap should be fine. If QEMU can
>>> not allocate a couple of bytes, we are in trouble anyway.
>>>
>>
>> OOM situations are rather unpredictable, and switching to g_malloc0 means we
>> will exit ungracefully in the middle of fetching dirty bitmaps. And this
>> function (vfio_bitmap_alloc) overall will be allocating megabytes for terabyte
>> guests.
>>
>> It would be ok if we are initializing, but this is at runtime when we do
>> migration. I think we should stick with g_try_new0. exit on failure should be
>> reserved to failure to switch the kernel migration state whereby we are likely
>> to be dealing with a hardware failure and thus requires something more drastic.
> 
> I agree for large allocation :
> 
>     vbmap->bitmap = g_try_malloc0(vbmap->size);
> 
> but not for the smaller ones, like VFIOBitmap. You would have to
> convert some other g_malloc0() calls, like the one allocating 'unmap'
> in vfio_dma_unmap_bitmap(), to be consistent.
> 
> Given the size of VFIOBitmap, I think it could live on the stack in
> routine vfio_dma_unmap_bitmap() and routine vfio_get_dirty_bitmap()
> since the reference is not kept.
> 

Both good points — especially the g_malloc0 ones, though the DMA unmap path wouldn't be
in use for a device that supports dirty tracking. But there is one place where we add it
by mistake, and that's vfio_device_feature_dma_logging_start_create(). It
shouldn't be g_malloc0 there either. As for the rest — apart from the dma_unmap and
type1-iommu get_dirty_bitmap functions — I would argue those allocations only happen
during initialization.

> The 'vbmap' attribute of vfio_giommu_dirty_notifier does not need
> to be a pointer either.
> 
> vfio_bitmap_alloc(hwaddr size) could then become
> vfio_bitmap_init(VFIOBitmap *vbmap, hwaddr size).
> 
> Anyhow, this is minor. It would simplify a bit the exit path
> and error handling.
> 
By "simplify", presumably you mean that vfio_bitmap_free() would become a single line,
thus avoiding the new helper, and instead we would just live with
vfio_bitmap_alloc(). I am in two minds about alloc vs init, considering we are
still allocating the actual bitmap. Still leaning more toward staying with alloc
than init.
Joao Martins March 4, 2023, 12:23 a.m. UTC | #8
On 02/03/2023 14:52, Cédric Le Goater wrote:
> Hello Joao,
> 
> On 3/2/23 14:24, Joao Martins wrote:
>> On 27/02/2023 14:09, Cédric Le Goater wrote:
>>> On 2/22/23 18:49, Avihai Horon wrote:
>>>> --- a/hw/vfio/common.c
>>>> +++ b/hw/vfio/common.c
>>>> @@ -320,6 +320,41 @@ const MemoryRegionOps vfio_region_ops = {
>>>>     * Device state interfaces
>>>>     */
>>>>    +typedef struct {
>>>> +    unsigned long *bitmap;
>>>> +    hwaddr size;
>>>> +    hwaddr pages;
>>>> +} VFIOBitmap;
>>>> +
>>>> +static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
>>>> +{
>>>> +    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
>>>
>>> I think using g_malloc0() for the VFIOBitmap should be fine. If QEMU can
>>> not allocate a couple of bytes, we are in trouble anyway.
>>>
>>
>> OOM situations are rather unpredictable, and switching to g_malloc0 means we
>> will exit ungracefully in the middle of fetching dirty bitmaps. And this
>> function (vfio_bitmap_alloc) overall will be allocating megabytes for terabyte
>> guests.
>>
>> It would be ok if we are initializing, but this is at runtime when we do
>> migration. I think we should stick with g_try_new0. exit on failure should be
>> reserved to failure to switch the kernel migration state whereby we are likely
>> to be dealing with a hardware failure and thus requires something more drastic.
> 
> I agree for large allocation :
> 
>     vbmap->bitmap = g_try_malloc0(vbmap->size);
> 
> but not for the smaller ones, like VFIOBitmap. You would have to
> convert some other g_malloc0() calls, like the one allocating 'unmap'
> in vfio_dma_unmap_bitmap(), to be consistent.
> 
> Given the size of VFIOBitmap, I think it could live on the stack in
> routine vfio_dma_unmap_bitmap() and routine vfio_get_dirty_bitmap()
> since the reference is not kept.
> 
> The 'vbmap' attribute of vfio_giommu_dirty_notifier does not need
> to be a pointer either.
> 
> vfio_bitmap_alloc(hwaddr size) could then become
> vfio_bitmap_init(VFIOBitmap *vbmap, hwaddr size).
> 
> Anyhow, this is minor. It would simplify a bit the exit path
> and error handling.
>

FWIW, I've addressed this on v3, following your suggestion.

	Joao
diff mbox series

Patch

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ac93b85632..84f08bdbbb 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -320,6 +320,41 @@  const MemoryRegionOps vfio_region_ops = {
  * Device state interfaces
  */
 
+typedef struct {
+    unsigned long *bitmap;
+    hwaddr size;
+    hwaddr pages;
+} VFIOBitmap;
+
+static VFIOBitmap *vfio_bitmap_alloc(hwaddr size)
+{
+    VFIOBitmap *vbmap = g_try_new0(VFIOBitmap, 1);
+    if (!vbmap) {
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
+    vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
+                                         BITS_PER_BYTE;
+    vbmap->bitmap = g_try_malloc0(vbmap->size);
+    if (!vbmap->bitmap) {
+        g_free(vbmap);
+        errno = ENOMEM;
+
+        return NULL;
+    }
+
+    return vbmap;
+}
+
+static void vfio_bitmap_dealloc(VFIOBitmap *vbmap)
+{
+    g_free(vbmap->bitmap);
+    g_free(vbmap);
+}
+
 bool vfio_mig_active(void)
 {
     VFIOGroup *group;
@@ -470,9 +505,14 @@  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
 {
     struct vfio_iommu_type1_dma_unmap *unmap;
     struct vfio_bitmap *bitmap;
-    uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
+    VFIOBitmap *vbmap;
     int ret;
 
+    vbmap = vfio_bitmap_alloc(size);
+    if (!vbmap) {
+        return -errno;
+    }
+
     unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
 
     unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
@@ -486,35 +526,28 @@  static int vfio_dma_unmap_bitmap(VFIOContainer *container,
      * qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
      * to qemu_real_host_page_size.
      */
-
     bitmap->pgsize = qemu_real_host_page_size();
-    bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
-                   BITS_PER_BYTE;
+    bitmap->size = vbmap->size;
+    bitmap->data = (__u64 *)vbmap->bitmap;
 
-    if (bitmap->size > container->max_dirty_bitmap_size) {
-        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
-                     (uint64_t)bitmap->size);
+    if (vbmap->size > container->max_dirty_bitmap_size) {
+        error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap->size);
         ret = -E2BIG;
         goto unmap_exit;
     }
 
-    bitmap->data = g_try_malloc0(bitmap->size);
-    if (!bitmap->data) {
-        ret = -ENOMEM;
-        goto unmap_exit;
-    }
-
     ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
     if (!ret) {
-        cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
-                iotlb->translated_addr, pages);
+        cpu_physical_memory_set_dirty_lebitmap(vbmap->bitmap,
+                iotlb->translated_addr, vbmap->pages);
     } else {
         error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
     }
 
-    g_free(bitmap->data);
 unmap_exit:
     g_free(unmap);
+    vfio_bitmap_dealloc(vbmap);
+
     return ret;
 }
 
@@ -1331,7 +1364,7 @@  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
 {
     struct vfio_iommu_type1_dirty_bitmap *dbitmap;
     struct vfio_iommu_type1_dirty_bitmap_get *range;
-    uint64_t pages;
+    VFIOBitmap *vbmap;
     int ret;
 
     if (!container->dirty_pages_supported) {
@@ -1341,6 +1374,11 @@  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
         return 0;
     }
 
+    vbmap = vfio_bitmap_alloc(size);
+    if (!vbmap) {
+        return -errno;
+    }
+
     dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
 
     dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
@@ -1355,15 +1393,8 @@  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
      * to qemu_real_host_page_size.
      */
     range->bitmap.pgsize = qemu_real_host_page_size();
-
-    pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
-    range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
-                                         BITS_PER_BYTE;
-    range->bitmap.data = g_try_malloc0(range->bitmap.size);
-    if (!range->bitmap.data) {
-        ret = -ENOMEM;
-        goto err_out;
-    }
+    range->bitmap.size = vbmap->size;
+    range->bitmap.data = (__u64 *)vbmap->bitmap;
 
     ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
     if (ret) {
@@ -1374,14 +1405,14 @@  static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
         goto err_out;
     }
 
-    cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
-                                            ram_addr, pages);
+    cpu_physical_memory_set_dirty_lebitmap(vbmap->bitmap, ram_addr,
+                                           vbmap->pages);
 
     trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
                                 range->bitmap.size, ram_addr);
 err_out:
-    g_free(range->bitmap.data);
     g_free(dbitmap);
+    vfio_bitmap_dealloc(vbmap);
 
     return ret;
 }