[v3,6/7] block/backup: teach backup_cow_with_bounce_buffer to copy more at once

Message ID 20190810193155.58637-7-vsementsov@virtuozzo.com
State New
Series backup improvements

Commit Message

Vladimir Sementsov-Ogievskiy Aug. 10, 2019, 7:31 p.m. UTC
backup_cow_with_offload can transfer more than one cluster. Let
backup_cow_with_bounce_buffer behave similarly. This reduces the number
of I/O requests, since there is no need to copy cluster by cluster.

The logic around bounce_buffer allocation has changed: we can't just
allocate a one-cluster-sized buffer to share for all iterations, and we
also can't allocate a buffer of full-request length, as it may be too
large, so BACKUP_MAX_BOUNCE_BUFFER is introduced. Finally, the
allocation logic is to allocate a buffer sufficient to handle all
remaining iterations at the point where we first need the buffer.

Bonus: get rid of pointer-to-pointer.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/backup.c | 65 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 24 deletions(-)
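
To make the chunking concrete, here is a minimal standalone sketch of the
arithmetic the new loop performs (not QEMU code: MIN and the constant are
redefined locally, and the 200 MiB request size is made up):

    #include <stdio.h>

    #define BACKUP_MAX_BOUNCE_BUFFER (64 * 1024 * 1024)
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        /* a hypothetical 200 MiB copy-before-write request */
        long long start = 0, remaining = 200LL * 1024 * 1024;

        while (remaining) {
            long long chunk = MIN((long long)BACKUP_MAX_BOUNCE_BUFFER, remaining);
            printf("copy [%lld, %lld)\n", start, start + chunk);
            start += chunk;
            remaining -= chunk;
        }
        return 0;
    }

This prints three 64 MiB chunks followed by one 8 MiB chunk, mirroring the
remaining_bytes loop in the diff quoted in the comments below.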

Comments

Max Reitz Aug. 12, 2019, 3:10 p.m. UTC | #1
On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
> diff --git a/block/backup.c b/block/backup.c
> index d482d93458..65f7212c85 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -27,6 +27,7 @@
>  #include "qemu/error-report.h"
>  
>  #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
> +#define BACKUP_MAX_BOUNCE_BUFFER (64 * 1024 * 1024)
>  
>  typedef struct CowRequest {
>      int64_t start_byte;
> @@ -98,44 +99,55 @@ static void cow_request_end(CowRequest *req)
>      qemu_co_queue_restart_all(&req->wait_queue);
>  }
>  
> -/* Copy range to target with a bounce buffer and return the bytes copied. If
> - * error occurred, return a negative error number */
> +/*
> + * Copy range to target with a bounce buffer and return the bytes copied. If
> + * error occurred, return a negative error number
> + *
> + * @bounce_buffer is assumed to enough to store

s/to/to be/

> + * MIN(BACKUP_MAX_BOUNCE_BUFFER, @end - @start) bytes
> + */
>  static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
>                                                        int64_t start,
>                                                        int64_t end,
>                                                        bool is_write_notifier,
>                                                        bool *error_is_read,
> -                                                      void **bounce_buffer)
> +                                                      void *bounce_buffer)
>  {
>      int ret;
>      BlockBackend *blk = job->common.blk;
> -    int nbytes;
> +    int nbytes, remaining_bytes;
>      int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
>  
>      assert(QEMU_IS_ALIGNED(start, job->cluster_size));
> -    bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
> -    nbytes = MIN(job->cluster_size, job->len - start);
> -    if (!*bounce_buffer) {
> -        *bounce_buffer = blk_blockalign(blk, job->cluster_size);
> -    }
> +    bdrv_reset_dirty_bitmap(job->copy_bitmap, start, end - start);
> +    nbytes = MIN(end - start, job->len - start);
>  
> -    ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
> -    if (ret < 0) {
> -        trace_backup_do_cow_read_fail(job, start, ret);
> -        if (error_is_read) {
> -            *error_is_read = true;
> +
> +    remaining_bytes = nbytes;
> +    while (remaining_bytes) {
> +        int chunk = MIN(BACKUP_MAX_BOUNCE_BUFFER, remaining_bytes);
> +
> +        ret = blk_co_pread(blk, start, chunk, bounce_buffer, read_flags);
> +        if (ret < 0) {
> +            trace_backup_do_cow_read_fail(job, start, ret);
> +            if (error_is_read) {
> +                *error_is_read = true;
> +            }
> +            goto fail;
>          }
> -        goto fail;
> -    }
>  
> -    ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
> -                        job->write_flags);
> -    if (ret < 0) {
> -        trace_backup_do_cow_write_fail(job, start, ret);
> -        if (error_is_read) {
> -            *error_is_read = false;
> +        ret = blk_co_pwrite(job->target, start, chunk, bounce_buffer,
> +                            job->write_flags);
> +        if (ret < 0) {
> +            trace_backup_do_cow_write_fail(job, start, ret);
> +            if (error_is_read) {
> +                *error_is_read = false;
> +            }
> +            goto fail;
>          }
> -        goto fail;
> +
> +        start += chunk;
> +        remaining_bytes -= chunk;
>      }
>  
>      return nbytes;
> @@ -301,9 +313,14 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
>              }
>          }
>          if (!job->use_copy_range) {
> +            if (!bounce_buffer) {
> +                size_t len = MIN(BACKUP_MAX_BOUNCE_BUFFER,
> +                                 MAX(dirty_end - start, end - dirty_end));
> +                bounce_buffer = blk_try_blockalign(job->common.blk, len);
> +            }

If you use _try_, you should probably also check whether it succeeded.
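
For instance (a sketch against the hunk above; the -ENOMEM value and the break
are assumptions about how the surrounding loop in backup_do_cow() would handle
the failure):

    if (!bounce_buffer) {
        size_t len = MIN(BACKUP_MAX_BOUNCE_BUFFER,
                         MAX(dirty_end - start, end - dirty_end));
        bounce_buffer = blk_try_blockalign(job->common.blk, len);
        if (!bounce_buffer) {
            /* blk_try_blockalign() returns NULL on allocation failure */
            ret = -ENOMEM;
            break;
        }
    }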

Anyway, I wonder whether it’d be better to just allocate this buffer
once per job (the first time we get here, probably) to be of size
BACKUP_MAX_BOUNCE_BUFFER and put it into BackupBlockJob.  (And maybe add
a buf-size parameter similar to what the mirror jobs have.)
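
A minimal sketch of that suggestion (the field name, its lazy allocation, and
freeing it in backup_clean() are assumptions; see Vladimir's objection below
about parallel copy-before-write requests):

    /* hypothetical field added to BackupBlockJob: */
    void *bounce_buffer;    /* allocated on first use, freed in backup_clean() */

    /* then, in backup_do_cow(): */
    if (!job->use_copy_range) {
        if (!job->bounce_buffer) {
            job->bounce_buffer = blk_blockalign(job->common.blk,
                                                BACKUP_MAX_BOUNCE_BUFFER);
        }
        ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
                                            is_write_notifier,
                                            error_is_read, job->bounce_buffer);
    }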

Max

>              ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
>                                                  is_write_notifier,
> -                                                error_is_read, &bounce_buffer);
> +                                                error_is_read, bounce_buffer);
>          }
>          if (ret < 0) {
>              break;
>
Vladimir Sementsov-Ogievskiy Aug. 12, 2019, 3:47 p.m. UTC | #2
12.08.2019 18:10, Max Reitz wrote:
> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
>> +            if (!bounce_buffer) {
>> +                size_t len = MIN(BACKUP_MAX_BOUNCE_BUFFER,
>> +                                 MAX(dirty_end - start, end - dirty_end));
>> +                bounce_buffer = blk_try_blockalign(job->common.blk, len);
>> +            }
> 
> If you use _try_, you should probably also check whether it succeeded.

Oops, you are right, of course.

> 
> Anyway, I wonder whether it’d be better to just allocate this buffer
> once per job (the first time we get here, probably) to be of size
> BACKUP_MAX_BOUNCE_BUFFER and put it into BackupBlockJob.  (And maybe add
> a buf-size parameter similar to what the mirror jobs have.)
> 

Once per job will not work, as we may have several guest writes in parallel and therefore
several parallel copy-before-write operations. Or, if you mean writing an allocator based
on a once-allocated buffer like in mirror, I really dislike this idea: we already have an
allocator (memalign/malloc/free) and it works well; there is no reason to invent a new one
and hardcode it into the block layer (see my answer to Eric on v2 of this patch for more
info).

Or, if you mean only the copy requests generated by backup_loop: yes, we may keep only
one buffer for them, but:
1. that is not how it works now, so my patch is not a regression in this case
2. I'm going to parallelize the backup loop too, like in my series "qcow2: async handling
   of fragmented io", which will need several allocated buffers anyway.

Max Reitz Aug. 12, 2019, 4:11 p.m. UTC | #3
On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
> 12.08.2019 18:10, Max Reitz wrote:
>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
>>
>> If you use _try_, you should probably also check whether it succeeded.
> 
> Oops, you are right, of course.
> 
>>
>> Anyway, I wonder whether it’d be better to just allocate this buffer
>> once per job (the first time we get here, probably) to be of size
>> BACKUP_MAX_BOUNCE_BUFFER and put it into BackupBlockJob.  (And maybe add
>> a buf-size parameter similar to what the mirror jobs have.)
>>
> 
> Once per job will not work, as we may have several guest writes in parallel and therefore
> several parallel copy-before-write operations.

Hm.  I’m not quite happy with that because if the guest just issues many
large discards in parallel, this means that qemu will allocate a large
amount of memory.

It would be nice if there was a simple way to keep track of the total
memory usage and let requests yield if they would exceed it.

> Or, if you mean writing an allocator based on a once-allocated buffer like in mirror,
> I really dislike this idea: we already have an allocator (memalign/malloc/free) and it
> works well; there is no reason to invent a new one and hardcode it into the block layer
> (see my answer to Eric on v2 of this patch for more info).

Well, at least it’d be something we can delay until blockdev-copy
arrives(TM).

Max

Vladimir Sementsov-Ogievskiy Aug. 12, 2019, 4:37 p.m. UTC | #4
12.08.2019 19:11, Max Reitz wrote:
> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>> 12.08.2019 18:10, Max Reitz wrote:
>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
>>
>> Once per job will not work, as we may have several guest writes in parallel and therefore
>> several parallel copy-before-write operations.
> 
> Hm.  I’m not quite happy with that because if the guest just issues many
> large discards in parallel, this means that qemu will allocate a large
> amount of memory.
> 
> It would be nice if there was a simple way to keep track of the total
> memory usage and let requests yield if they would exceed it.

Agree, it should be fixed anyway.

Vladimir Sementsov-Ogievskiy Aug. 13, 2019, 2:14 p.m. UTC | #5
12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
> 12.08.2019 19:11, Max Reitz wrote:
>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>> 12.08.2019 18:10, Max Reitz wrote:
>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
>>>
>>> Once per job will not work, as we may have several guest writes in parallel and therefore
>>> several parallel copy-before-write operations.
>>
>> Hm.  I’m not quite happy with that because if the guest just issues many
>> large discards in parallel, this means that qemu will allocate a large
>> amount of memory.
>>
>> It would be nice if there was a simple way to keep track of the total
>> memory usage and let requests yield if they would exceed it.
> 
> Agree, it should be fixed anyway.
> 


But still..

Synchronous mirror allocates full-request buffers on guest write. Is it correct?

If we assume that it is correct to double the memory usage of guest operations, then for
backup the problem is only in write_zeroes and discard, where guest-assumed memory usage
should be zero. And if we should distinguish writes from write_zeroes and discard, it's
better to postpone this improvement until after the backup-top filter is merged.
Max Reitz Aug. 13, 2019, 2:23 p.m. UTC | #6
On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
> 12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
>> 12.08.2019 19:11, Max Reitz wrote:
>>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>>> 12.08.2019 18:10, Max Reitz wrote:
>>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
>>>>
>>>> Once per job will not work, as we may have several guest writes in parallel and therefore
>>>> several parallel copy-before-write operations.
>>>
>>> Hm.  I’m not quite happy with that because if the guest just issues many
>>> large discards in parallel, this means that qemu will allocate a large
>>> amount of memory.
>>>
>>> It would be nice if there was a simple way to keep track of the total
>>> memory usage and let requests yield if they would exceed it.
>>
>> Agree, it should be fixed anyway.
>>
> 
> 
> But still..
> 
> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
> 
> If we assume that it is correct to double the memory usage of guest operations, then for
> backup the problem is only in write_zeroes and discard, where guest-assumed memory usage
> should be zero.

Well, but that is the problem.  I didn’t say anything in v2, because I
only thought of normal writes and I found it fine to double the memory
usage there (a guest won’t issue huge write requests in parallel).  But
discard/write-zeroes are a different matter.

> And if we should distinguish writes from write_zeroes and discard, it's better to
> postpone this improvement until after the backup-top filter is merged.

But do you need to distinguish it?  Why not just keep track of memory
usage and put the current I/O coroutine to sleep in a CoQueue or
something, and wake that up at the end of backup_do_cow()?
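
A sketch of what that could look like (names and the let-one-through guard are
assumptions; CoQueue, qemu_co_queue_wait() and qemu_co_queue_restart_all() are
QEMU's existing coroutine primitives):

    /* hypothetical per-job accounting, initialized with qemu_co_queue_init() */
    typedef struct BackupMemLimit {
        size_t in_flight;   /* bounce-buffer bytes currently allocated */
        size_t limit;       /* e.g. BACKUP_MAX_BOUNCE_BUFFER */
        CoQueue wait_queue; /* CBW coroutines waiting for memory */
    } BackupMemLimit;

    static void coroutine_fn backup_mem_acquire(BackupMemLimit *m, size_t bytes)
    {
        /* Sleep while other requests hold too much memory.  The in_flight
         * check lets a single oversized request proceed alone instead of
         * deadlocking. */
        while (m->in_flight && m->in_flight + bytes > m->limit) {
            qemu_co_queue_wait(&m->wait_queue, NULL);
        }
        m->in_flight += bytes;
    }

    static void backup_mem_release(BackupMemLimit *m, size_t bytes)
    {
        m->in_flight -= bytes;
        qemu_co_queue_restart_all(&m->wait_queue);
    }

backup_do_cow() would call backup_mem_acquire() before allocating its bounce
buffer and backup_mem_release() at the end, so CBW coroutines yield instead of
piling up allocations.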

Max
Vladimir Sementsov-Ogievskiy Aug. 13, 2019, 2:39 p.m. UTC | #7
13.08.2019 17:23, Max Reitz wrote:
> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
>> 12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
>>> 12.08.2019 19:11, Max Reitz wrote:
>>>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 12.08.2019 18:10, Max Reitz wrote:
>>>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
[...]
> 
> Well, but that is the problem.  I didn’t say anything in v2, because I
> only thought of normal writes and I found it fine to double the memory
> usage there (a guest won’t issue huge write requests in parallel).  But
> discard/write-zeroes are a different matter.
> 
>> And if we should distinguish writes from write_zeroes and discard, it's better to
>> postpone this improvement until after the backup-top filter is merged.
> 
> But do you need to distinguish it?  Why not just keep track of memory
> usage and put the current I/O coroutine to sleep in a CoQueue or
> something, and wake that up at the end of backup_do_cow()?
> 

1. Because if we _can_ allow doubling of memory, it's more effective not to restrict
allocations on guest writes. It just seems to be the more effective technique.

2. Anyway, I'd allow some always-available size to allocate - let it be one cluster,
which corresponds to the current behavior and prevents guest I/O hangs in the worst case.
So I mean: if we have enough memory, allow an individual CBW operation to allocate the
whole buffer, and if we have no extra memory, allow it to allocate one cluster.
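
In code, that two-tier policy might look roughly like this (a sketch; how the
fallback would interact with the memory accounting discussed above is left open):

    /* try the large buffer first; fall back to a guaranteed-small one */
    size_t len = MIN(BACKUP_MAX_BOUNCE_BUFFER,
                     MAX(dirty_end - start, end - dirty_end));
    bounce_buffer = blk_try_blockalign(job->common.blk, len);
    if (!bounce_buffer) {
        /* one cluster matches the pre-patch behavior and keeps guest I/O
         * from failing outright under memory pressure */
        len = job->cluster_size;
        bounce_buffer = blk_blockalign(job->common.blk, len);
    }

The copy loop would then also have to cap each chunk at the buffer size actually
obtained, not just at BACKUP_MAX_BOUNCE_BUFFER.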
Max Reitz Aug. 13, 2019, 2:57 p.m. UTC | #8
On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
> 13.08.2019 17:23, Max Reitz wrote:
>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
>>> 12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
>>>> 12.08.2019 19:11, Max Reitz wrote:
>>>>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> 12.08.2019 18:10, Max Reitz wrote:
>>>>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>>> backup_cow_with_offload can transfer more than one cluster. Let
>>>>>>>> backup_cow_with_bounce_buffer behave similarly. It reduces the number
>>>>>>>> of IO requests, since there is no need to copy cluster by cluster.
>>>>>>>>
>>>>>>>> Logic around bounce_buffer allocation changed: we can't just allocate
>>>>>>>> one-cluster-sized buffer to share for all iterations. We can't also
>>>>>>>> allocate buffer of full-request length it may be too large, so
>>>>>>>> BACKUP_MAX_BOUNCE_BUFFER is introduced. And finally, allocation logic
>>>>>>>> is to allocate a buffer sufficient to handle all remaining iterations
>>>>>>>> at the point where we need the buffer for the first time.
>>>>>>>>
>>>>>>>> Bonus: get rid of pointer-to-pointer.
>>>>>>>>
>>>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>>>>> ---
>>>>>>>>     block/backup.c | 65 +++++++++++++++++++++++++++++++-------------------
>>>>>>>>     1 file changed, 41 insertions(+), 24 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/block/backup.c b/block/backup.c
>>>>>>>> index d482d93458..65f7212c85 100644
>>>>>>>> --- a/block/backup.c
>>>>>>>> +++ b/block/backup.c
>>>>>>>> @@ -27,6 +27,7 @@
>>>>>>>>     #include "qemu/error-report.h"
>>>>>>>>     #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
>>>>>>>> +#define BACKUP_MAX_BOUNCE_BUFFER (64 * 1024 * 1024)
>>>>>>>>     typedef struct CowRequest {
>>>>>>>>         int64_t start_byte;
>>>>>>>> @@ -98,44 +99,55 @@ static void cow_request_end(CowRequest *req)
>>>>>>>>         qemu_co_queue_restart_all(&req->wait_queue);
>>>>>>>>     }
>>>>>>>> -/* Copy range to target with a bounce buffer and return the bytes copied. If
>>>>>>>> - * error occurred, return a negative error number */
>>>>>>>> +/*
>>>>>>>> + * Copy range to target with a bounce buffer and return the bytes copied. If
>>>>>>>> + * error occurred, return a negative error number
>>>>>>>> + *
>>>>>>>> + * @bounce_buffer is assumed to enough to store
>>>>>>>
>>>>>>> s/to/to be/
>>>>>>>
>>>>>>>> + * MIN(BACKUP_MAX_BOUNCE_BUFFER, @end - @start) bytes
>>>>>>>> + */
>>>>>>>>     static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
>>>>>>>>                                                           int64_t start,
>>>>>>>>                                                           int64_t end,
>>>>>>>>                                                           bool is_write_notifier,
>>>>>>>>                                                           bool *error_is_read,
>>>>>>>> -                                                      void **bounce_buffer)
>>>>>>>> +                                                      void *bounce_buffer)
>>>>>>>>     {
>>>>>>>>         int ret;
>>>>>>>>         BlockBackend *blk = job->common.blk;
>>>>>>>> -    int nbytes;
>>>>>>>> +    int nbytes, remaining_bytes;
>>>>>>>>         int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
>>>>>>>>         assert(QEMU_IS_ALIGNED(start, job->cluster_size));
>>>>>>>> -    bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
>>>>>>>> -    nbytes = MIN(job->cluster_size, job->len - start);
>>>>>>>> -    if (!*bounce_buffer) {
>>>>>>>> -        *bounce_buffer = blk_blockalign(blk, job->cluster_size);
>>>>>>>> -    }
>>>>>>>> +    bdrv_reset_dirty_bitmap(job->copy_bitmap, start, end - start);
>>>>>>>> +    nbytes = MIN(end - start, job->len - start);
>>>>>>>> -    ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
>>>>>>>> -    if (ret < 0) {
>>>>>>>> -        trace_backup_do_cow_read_fail(job, start, ret);
>>>>>>>> -        if (error_is_read) {
>>>>>>>> -            *error_is_read = true;
>>>>>>>> +
>>>>>>>> +    remaining_bytes = nbytes;
>>>>>>>> +    while (remaining_bytes) {
>>>>>>>> +        int chunk = MIN(BACKUP_MAX_BOUNCE_BUFFER, remaining_bytes);
>>>>>>>> +
>>>>>>>> +        ret = blk_co_pread(blk, start, chunk, bounce_buffer, read_flags);
>>>>>>>> +        if (ret < 0) {
>>>>>>>> +            trace_backup_do_cow_read_fail(job, start, ret);
>>>>>>>> +            if (error_is_read) {
>>>>>>>> +                *error_is_read = true;
>>>>>>>> +            }
>>>>>>>> +            goto fail;
>>>>>>>>             }
>>>>>>>> -        goto fail;
>>>>>>>> -    }
>>>>>>>> -    ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
>>>>>>>> -                        job->write_flags);
>>>>>>>> -    if (ret < 0) {
>>>>>>>> -        trace_backup_do_cow_write_fail(job, start, ret);
>>>>>>>> -        if (error_is_read) {
>>>>>>>> -            *error_is_read = false;
>>>>>>>> +        ret = blk_co_pwrite(job->target, start, chunk, bounce_buffer,
>>>>>>>> +                            job->write_flags);
>>>>>>>> +        if (ret < 0) {
>>>>>>>> +            trace_backup_do_cow_write_fail(job, start, ret);
>>>>>>>> +            if (error_is_read) {
>>>>>>>> +                *error_is_read = false;
>>>>>>>> +            }
>>>>>>>> +            goto fail;
>>>>>>>>             }
>>>>>>>> -        goto fail;
>>>>>>>> +
>>>>>>>> +        start += chunk;
>>>>>>>> +        remaining_bytes -= chunk;
>>>>>>>>         }
>>>>>>>>         return nbytes;
>>>>>>>> @@ -301,9 +313,14 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
>>>>>>>>                 }
>>>>>>>>             }
>>>>>>>>             if (!job->use_copy_range) {
>>>>>>>> +            if (!bounce_buffer) {
>>>>>>>> +                size_t len = MIN(BACKUP_MAX_BOUNCE_BUFFER,
>>>>>>>> +                                 MAX(dirty_end - start, end - dirty_end));
>>>>>>>> +                bounce_buffer = blk_try_blockalign(job->common.blk, len);
>>>>>>>> +            }
>>>>>>>
>>>>>>> If you use _try_, you should probably also check whether it succeeded.
>>>>>>
>>>>>> Oops, you are right, of course.
>>>>>>
>>>>>>>
>>>>>>> Anyway, I wonder whether it’d be better to just allocate this buffer
>>>>>>> once per job (the first time we get here, probably) to be of size
>>>>>>> BACKUP_MAX_BOUNCE_BUFFER and put it into BackupBlockJob.  (And maybe add
>>>>>>> a buf-size parameter similar to what the mirror jobs have.)
>>>>>>>
>>>>>>
>>>>>> Once per job will not work, as we may have several guest writes in parallel and therefore
>>>>>> several parallel copy-before-write operations.
>>>>>
>>>>> Hm.  I’m not quite happy with that because if the guest just issues many
>>>>> large discards in parallel, this means that qemu will allocate a large
>>>>> amount of memory.
>>>>>
>>>>> It would be nice if there was a simple way to keep track of the total
>>>>> memory usage and let requests yield if they would exceed it.
>>>>
>>>> Agree, it should be fixed anyway.
>>>>
>>>
>>>
>>> But still..
>>>
>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>
>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>
>> Well, but that is the problem.  I didn’t say anything in v2, because I
>> only thought of normal writes and I found it fine to double the memory
>> usage there (a guest won’t issue huge write requests in parallel).  But
>> discard/write-zeroes are a different matter.
>>
>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>> improvement until after the backup-top filter is merged.
>>
>> But do you need to distinguish it?  Why not just keep track of memory
>> usage and put the current I/O coroutine to sleep in a CoQueue or
>> something, and wake that up at the end of backup_do_cow()?
>>
> 
> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
> guest writes. It just seems to be a more effective technique.

But the problem with backup and zero writes/discards is that the memory
is not doubled.  The request doesn’t need any memory, but the CBW
operation does, and maybe lots of it.

So the guest may issue many zero writes/discards in parallel and thus
exhaust memory on the host.

> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
> to the current behavior and prevent a guest I/O hang in the worst case.

The guest would only hang if we have to copy more than e.g. 64 MB at
a time.  At which point I think it’s not unreasonable to sequentialize
requests.

Max

> So I mean, if we have enough memory, allow the
> individual CBW operation to allocate the whole buffer, and if we have no extra memory, allow it to allocate one
> cluster.
>
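
For illustration, the CoQueue-based accounting suggested above could look roughly like the sketch below. Everything in it is hypothetical and not part of this series: the BACKUP_TOTAL_MEM_LIMIT constant, the uint64_t in_flight_mem counter and CoQueue mem_queue assumed on BackupBlockJob, and both helpers.

#define BACKUP_TOTAL_MEM_LIMIT (64 * 1024 * 1024)

/* Park the calling coroutine until @bytes fits under the job-wide limit. */
static void coroutine_fn backup_mem_reserve(BackupBlockJob *job,
                                            uint64_t bytes)
{
    while (job->in_flight_mem + bytes > BACKUP_TOTAL_MEM_LIMIT) {
        qemu_co_queue_wait(&job->mem_queue, NULL);
    }
    job->in_flight_mem += bytes;
}

/* Return the memory at the end of backup_do_cow() and wake any
 * coroutines sleeping in backup_mem_reserve(). */
static void backup_mem_release(BackupBlockJob *job, uint64_t bytes)
{
    job->in_flight_mem -= bytes;
    qemu_co_queue_restart_all(&job->mem_queue);
}

Each copy-before-write operation would bracket its bounce-buffer usage with these two calls, so a flood of parallel guest discards could no longer drive allocations past the limit.
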
Vladimir Sementsov-Ogievskiy Aug. 13, 2019, 3 p.m. UTC | #9
13.08.2019 17:57, Max Reitz wrote:
> On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
>> 13.08.2019 17:23, Max Reitz wrote:
>>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
>>>> 12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 12.08.2019 19:11, Max Reitz wrote:
>>>>>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> 12.08.2019 18:10, Max Reitz wrote:
>>>>>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:

[...]

>>>> But still..
>>>>
>>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>>
>>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>>
>>> Well, but that is the problem.  I didn’t say anything in v2, because I
>>> only thought of normal writes and I found it fine to double the memory
>>> usage there (a guest won’t issue huge write requests in parallel).  But
>>> discard/write-zeroes are a different matter.
>>>
>>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>>> improvement until after the backup-top filter is merged.
>>>
>>> But do you need to distinguish it?  Why not just keep track of memory
>>> usage and put the current I/O coroutine to sleep in a CoQueue or
>>> something, and wake that up at the end of backup_do_cow()?
>>>
>>
>> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
>> guest writes. It just seems to be a more effective technique.
> 
> But the problem with backup and zero writes/discards is that the memory
> is not doubled.  The request doesn’t need any memory, but the CBW
> operation does, and maybe lots of it.
> 
> So the guest may issue many zero writes/discards in parallel and thus
> exhaust memory on the host.

So this is the reason to separate writes from write-zeroes/discards. So at least writes will be happy. And I
think that writes are more frequent requests than write-zeroes/discards.

> 
>> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
>> to the current behavior and prevent a guest I/O hang in the worst case.
> 
> The guest would only hang if we have to copy more than e.g. 64 MB at
> a time.  At which point I think it’s not unreasonable to sequentialize
> requests.
>
Max Reitz Aug. 13, 2019, 3:02 p.m. UTC | #10
On 13.08.19 17:00, Vladimir Sementsov-Ogievskiy wrote:
> 13.08.2019 17:57, Max Reitz wrote:
>> On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
>>> 13.08.2019 17:23, Max Reitz wrote:
>>>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> 12.08.2019 19:11, Max Reitz wrote:
>>>>>>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>>> 12.08.2019 18:10, Max Reitz wrote:
>>>>>>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:

[...]

>>>>> But still..
>>>>>
>>>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>>>
>>>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>>>
>>>> Well, but that is the problem.  I didn’t say anything in v2, because I
>>>> only thought of normal writes and I found it fine to double the memory
>>>> usage there (a guest won’t issue huge write requests in parallel).  But
>>>> discard/write-zeroes are a different matter.
>>>>
>>>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>>>> improvement until after the backup-top filter is merged.
>>>>
>>>> But do you need to distinguish it?  Why not just keep track of memory
>>>> usage and put the current I/O coroutine to sleep in a CoQueue or
>>>> something, and wake that up at the end of backup_do_cow()?
>>>>
>>>
>>> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
>>> guest writes. It just seems to be a more effective technique.
>>
>> But the problem with backup and zero writes/discards is that the memory
>> is not doubled.  The request doesn’t need any memory, but the CBW
>> operation does, and maybe lots of it.
>>
>> So the guest may issue many zero writes/discards in parallel and thus
>> exhaust memory on the host.
> 
> So this is the reason to separate writes from write-zeroes/discards. So at least writes will be happy. And I
> think that writes are more frequent requests than write-zeroes/discards.

But that makes it complicated for no practical gain whatsoever.

>>
>>> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
>>> to the current behavior and prevent a guest I/O hang in the worst case.
>>
>> The guest would only hang if we have to copy more than e.g. 64 MB at
>> a time.  At which point I think it’s not unreasonable to sequentialize
>> requests.

Because of this.  How is it bad to start sequentializing writes when the
data exceeds 64 MB?

Max
Vladimir Sementsov-Ogievskiy Aug. 13, 2019, 3:32 p.m. UTC | #11
13.08.2019 18:02, Max Reitz wrote:
> On 13.08.19 17:00, Vladimir Sementsov-Ogievskiy wrote:
>> 13.08.2019 17:57, Max Reitz wrote:
>>> On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
>>>> 13.08.2019 17:23, Max Reitz wrote:
>>>>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> 12.08.2019 19:37, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> 12.08.2019 19:11, Max Reitz wrote:
>>>>>>>> On 12.08.19 17:47, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>>>> 12.08.2019 18:10, Max Reitz wrote:
>>>>>>>>>> On 10.08.19 21:31, Vladimir Sementsov-Ogievskiy wrote:

[...]

>>>>>> But still..
>>>>>>
>>>>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>>>>
>>>>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>>>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>>>>
>>>>> Well, but that is the problem.  I didn’t say anything in v2, because I
>>>>> only thought of normal writes and I found it fine to double the memory
>>>>> usage there (a guest won’t issue huge write requests in parallel).  But
>>>>> discard/write-zeroes are a different matter.
>>>>>
>>>>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>>>>> improvement until after the backup-top filter is merged.
>>>>>
>>>>> But do you need to distinguish it?  Why not just keep track of memory
>>>>> usage and put the current I/O coroutine to sleep in a CoQueue or
>>>>> something, and wake that up at the end of backup_do_cow()?
>>>>>
>>>>
>>>> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
>>>> guest writes. It just seems to be a more effective technique.
>>>
>>> But the problem with backup and zero writes/discards is that the memory
>>> is not doubled.  The request doesn’t need any memory, but the CBW
>>> operation does, and maybe lots of it.
>>>
>>> So the guest may issue many zero writes/discards in parallel and thus
>>> exhaust memory on the host.
>>
>> So this is the reason to separate writes from write-zeroes/discards. So at least writes will be happy. And I
>> think that writes are more frequent requests than write-zeroes/discards.
> 
> But that makes it complicated for no practical gain whatsoever.
> 
>>>
>>>> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
>>>> to the current behavior and prevent a guest I/O hang in the worst case.
>>>
>>> The guest would only hang if we have to copy more than e.g. 64 MB at
>>> a time.  At which point I think it’s not unreasonable to sequentialize
>>> requests.
> 
> Because of this.  How is it bad to start sequentializing writes when the
> data exceeds 64 MB?
> 

So you want a total memory limit of 64 MB? (with a possible parameter like in mirror)

And an allocation algorithm to copy count bytes:

if free_mem >= count: allocate count bytes
else if free_mem >= cluster: allocate cluster and copy in a loop
else: wait in the co-queue until some memory is available and retry

Is it OK for you?
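
That three-branch policy could be sketched as follows; it reuses the hypothetical in_flight_mem/mem_queue fields from the sketch above plus an equally hypothetical mem_limit field (none of these exist in the series), while blk_blockalign(), common.blk and cluster_size are real:

/* Reserve memory for copying @count bytes and allocate a bounce buffer;
 * the size actually reserved is returned in *len. */
static void *coroutine_fn backup_alloc_bounce_buffer(BackupBlockJob *job,
                                                     int64_t count,
                                                     int64_t *len)
{
    for (;;) {
        int64_t free_mem = job->mem_limit - job->in_flight_mem;

        if (free_mem >= count) {
            *len = count;                /* whole request at once */
        } else if (free_mem >= job->cluster_size) {
            *len = job->cluster_size;    /* fall back to cluster-by-cluster */
        } else {
            /* Out of memory: sleep until a concurrent backup_do_cow()
             * releases its reservation, then re-evaluate. */
            qemu_co_queue_wait(&job->mem_queue, NULL);
            continue;
        }

        job->in_flight_mem += *len;
        return blk_blockalign(job->common.blk, *len);
    }
}

The cluster-sized fallback branch is what keeps a single large request from waiting forever behind a stream of small ones, which is the fairness point taken up in the following messages.
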
Max Reitz Aug. 13, 2019, 4:30 p.m. UTC | #12
On 13.08.19 17:32, Vladimir Sementsov-Ogievskiy wrote:
> 13.08.2019 18:02, Max Reitz wrote:
>> On 13.08.19 17:00, Vladimir Sementsov-Ogievskiy wrote:
>>> 13.08.2019 17:57, Max Reitz wrote:
>>>> On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 13.08.2019 17:23, Max Reitz wrote:
>>>>>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:

[...]

>>>>>>> But still..
>>>>>>>
>>>>>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>>>>>
>>>>>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>>>>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>>>>>
>>>>>> Well, but that is the problem.  I didn’t say anything in v2, because I
>>>>>> only thought of normal writes and I found it fine to double the memory
>>>>>> usage there (a guest won’t issue huge write requests in parallel).  But
>>>>>> discard/write-zeroes are a different matter.
>>>>>>
>>>>>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>>>>>> improvement until after the backup-top filter is merged.
>>>>>>
>>>>>> But do you need to distinguish it?  Why not just keep track of memory
>>>>>> usage and put the current I/O coroutine to sleep in a CoQueue or
>>>>>> something, and wake that up at the end of backup_do_cow()?
>>>>>>
>>>>>
>>>>> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
>>>>> guest writes. It just seems to be a more effective technique.
>>>>
>>>> But the problem with backup and zero writes/discards is that the memory
>>>> is not doubled.  The request doesn’t need any memory, but the CBW
>>>> operation does, and maybe lots of it.
>>>>
>>>> So the guest may issue many zero writes/discards in parallel and thus
>>>> exhaust memory on the host.
>>>
>>> So this is the reason to separate writes from write-zeroes/discards. So at least writes will be happy. And I
>>> think that writes are more frequent requests than write-zeroes/discards.
>>
>> But that makes it complicated for no practical gain whatsoever.
>>
>>>>
>>>>> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
>>>>> to the current behavior and prevent a guest I/O hang in the worst case.
>>>>
>>>> The guest would only hang if we have to copy more than e.g. 64 MB at
>>>> a time.  At which point I think it’s not unreasonable to sequentialize
>>>> requests.
>>
>> Because of this.  How is it bad to start sequentializing writes when the
>> data exceeds 64 MB?
>>
> 
> So you want a total memory limit of 64 MB? (with a possible parameter like in mirror)
> 
> And an allocation algorithm to copy count bytes:
> 
> if free_mem >= count: allocate count bytes
> else if free_mem >= cluster: allocate cluster and copy in a loop
> else: wait in the co-queue until some memory is available and retry
> 
> Is it OK for you?

Sounds good to me, although I don’t know whether the second branch is
necessary.  As I’ve said, the total limit is just an insurance against a
guest that does some crazy stuff.

Max
Vladimir Sementsov-Ogievskiy Aug. 13, 2019, 4:45 p.m. UTC | #13
13.08.2019 19:30, Max Reitz wrote:
> On 13.08.19 17:32, Vladimir Sementsov-Ogievskiy wrote:
>> 13.08.2019 18:02, Max Reitz wrote:
>>> On 13.08.19 17:00, Vladimir Sementsov-Ogievskiy wrote:
>>>> 13.08.2019 17:57, Max Reitz wrote:
>>>>> On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> 13.08.2019 17:23, Max Reitz wrote:
>>>>>>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
> 
> [...]
> 
>>>>>>>> But still..
>>>>>>>>
>>>>>>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>>>>>>
>>>>>>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>>>>>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>>>>>>
>>>>>>> Well, but that is the problem.  I didn’t say anything in v2, because I
>>>>>>> only thought of normal writes and I found it fine to double the memory
>>>>>>> usage there (a guest won’t issue huge write requests in parallel).  But
>>>>>>> discard/write-zeroes are a different matter.
>>>>>>>
>>>>>>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>>>>>>> improvement until after the backup-top filter is merged.
>>>>>>>
>>>>>>> But do you need to distinguish it?  Why not just keep track of memory
>>>>>>> usage and put the current I/O coroutine to sleep in a CoQueue or
>>>>>>> something, and wake that up at the end of backup_do_cow()?
>>>>>>>
>>>>>>
>>>>>> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
>>>>>> guest writes. It just seems to be a more effective technique.
>>>>>
>>>>> But the problem with backup and zero writes/discards is that the memory
>>>>> is not doubled.  The request doesn’t need any memory, but the CBW
>>>>> operation does, and maybe lots of it.
>>>>>
>>>>> So the guest may issue many zero writes/discards in parallel and thus
>>>>> exhaust memory on the host.
>>>>
>>>> So this is the reason to separate writes from write-zeroes/discards. So at least writes will be happy. And I
>>>> think that writes are more frequent requests than write-zeroes/discards.
>>>
>>> But that makes it complicated for no practical gain whatsoever.
>>>
>>>>>
>>>>>> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
>>>>>> to the current behavior and prevent a guest I/O hang in the worst case.
>>>>>
>>>>> The guest would only hang if we have to copy more than e.g. 64 MB at
>>>>> a time.  At which point I think it’s not unreasonable to sequentialize
>>>>> requests.
>>>
>>> Because of this.  How is it bad to start sequentializing writes when the
>>> data exceeds 64 MB?
>>>
>>
>> So you want a total memory limit of 64 MB? (with a possible parameter like in mirror)
>>
>> And an allocation algorithm to copy count bytes:
>>
>> if free_mem >= count: allocate count bytes
>> else if free_mem >= cluster: allocate cluster and copy in a loop
>> else: wait in the co-queue until some memory is available and retry
>>
>> Is it OK for you?
> 
> Sounds good to me, although I don’t know whether the second branch is
> necessary.  As I’ve said, the total limit is just an insurance against a
> guest that does some crazy stuff.
> 

I'm afraid that if there were one big request, it might wait forever while smaller
requests eat most of the available memory. So it would be an unfair queue: smaller
requests would have higher priority in the low-memory case. With [2] it becomes fairer.
Max Reitz Aug. 13, 2019, 4:51 p.m. UTC | #14
On 13.08.19 18:45, Vladimir Sementsov-Ogievskiy wrote:
> 13.08.2019 19:30, Max Reitz wrote:
>> On 13.08.19 17:32, Vladimir Sementsov-Ogievskiy wrote:
>>> 13.08.2019 18:02, Max Reitz wrote:
>>>> On 13.08.19 17:00, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 13.08.2019 17:57, Max Reitz wrote:
>>>>>> On 13.08.19 16:39, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> 13.08.2019 17:23, Max Reitz wrote:
>>>>>>>> On 13.08.19 16:14, Vladimir Sementsov-Ogievskiy wrote:
>>
>> [...]
>>
>>>>>>>>> But still..
>>>>>>>>>
>>>>>>>>> Synchronous mirror allocates full-request buffers on guest write. Is it correct?
>>>>>>>>>
>>>>>>>>> If we assume that it is correct to double memory usage of guest operations, then for backup
>>>>>>>>> the problem is only in write_zero and discard where guest-assumed memory usage should be zero.
>>>>>>>>
>>>>>>>> Well, but that is the problem.  I didn’t say anything in v2, because I
>>>>>>>> only thought of normal writes and I found it fine to double the memory
>>>>>>>> usage there (a guest won’t issue huge write requests in parallel).  But
>>>>>>>> discard/write-zeroes are a different matter.
>>>>>>>>
>>>>>>>>> And if we should distinguish writes from write_zeroes and discard, it's better to postpone this
>>>>>>>>> improvement until after the backup-top filter is merged.
>>>>>>>>
>>>>>>>> But do you need to distinguish it?  Why not just keep track of memory
>>>>>>>> usage and put the current I/O coroutine to sleep in a CoQueue or
>>>>>>>> something, and wake that up at the end of backup_do_cow()?
>>>>>>>>
>>>>>>>
>>>>>>> 1. Because if we _can_ allow doubling of memory, it's more effective to not restrict allocations on
>>>>>>> guest writes. It just seems to be a more effective technique.
>>>>>>
>>>>>> But the problem with backup and zero writes/discards is that the memory
>>>>>> is not doubled.  The request doesn’t need any memory, but the CBW
>>>>>> operation does, and maybe lots of it.
>>>>>>
>>>>>> So the guest may issue many zero writes/discards in parallel and thus
>>>>>> exhaust memory on the host.
>>>>>
>>>>> So this is the reason to separate writes from write-zeroes/discards. So at least writes will be happy. And I
>>>>> think that writes are more frequent requests than write-zeroes/discards.
>>>>
>>>> But that makes it complicated for no practical gain whatsoever.
>>>>
>>>>>>
>>>>>>> 2. Anyway, I'd allow some always-available size to allocate - let it be one cluster, which will correspond
>>>>>>> to the current behavior and prevent a guest I/O hang in the worst case.
>>>>>>
>>>>>> The guest would only hang if we have to copy more than e.g. 64 MB at
>>>>>> a time.  At which point I think it’s not unreasonable to sequentialize
>>>>>> requests.
>>>>
>>>> Because of this.  How is it bad to start sequentializing writes when the
>>>> data exceeds 64 MB?
>>>>
>>>
>>> So you want a total memory limit of 64 MB? (with a possible parameter like in mirror)
>>>
>>> And an allocation algorithm to copy count bytes:
>>>
>>> if free_mem >= count: allocate count bytes
>>> else if free_mem >= cluster: allocate cluster and copy in a loop
>>> else: wait in the co-queue until some memory is available and retry
>>>
>>> Is it OK for you?
>>
>> Sounds good to me, although I don’t know whether the second branch is
>> necessary.  As I’ve said, the total limit is just an insurance against a
>> guest that does some crazy stuff.
>>
> 
> I'm afraid that if there were one big request, it might wait forever while smaller
> requests eat most of the available memory. So it would be an unfair queue: smaller
> requests would have higher priority in the low-memory case. With [2] it becomes fairer.

OK.  Sounds reasonable.

Max
diff mbox series

Patch

diff --git a/block/backup.c b/block/backup.c
index d482d93458..65f7212c85 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -27,6 +27,7 @@ 
 #include "qemu/error-report.h"
 
 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
+#define BACKUP_MAX_BOUNCE_BUFFER (64 * 1024 * 1024)
 
 typedef struct CowRequest {
     int64_t start_byte;
@@ -98,44 +99,55 @@  static void cow_request_end(CowRequest *req)
     qemu_co_queue_restart_all(&req->wait_queue);
 }
 
-/* Copy range to target with a bounce buffer and return the bytes copied. If
- * error occurred, return a negative error number */
+/*
+ * Copy range to target with a bounce buffer and return the bytes copied. If
+ * error occurred, return a negative error number
+ *
+ * @bounce_buffer is assumed to enough to store
+ * MIN(BACKUP_MAX_BOUNCE_BUFFER, @end - @start) bytes
+ */
 static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end,
                                                       bool is_write_notifier,
                                                       bool *error_is_read,
-                                                      void **bounce_buffer)
+                                                      void *bounce_buffer)
 {
     int ret;
     BlockBackend *blk = job->common.blk;
-    int nbytes;
+    int nbytes, remaining_bytes;
     int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
 
     assert(QEMU_IS_ALIGNED(start, job->cluster_size));
-    bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
-    nbytes = MIN(job->cluster_size, job->len - start);
-    if (!*bounce_buffer) {
-        *bounce_buffer = blk_blockalign(blk, job->cluster_size);
-    }
+    bdrv_reset_dirty_bitmap(job->copy_bitmap, start, end - start);
+    nbytes = MIN(end - start, job->len - start);
 
-    ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
-    if (ret < 0) {
-        trace_backup_do_cow_read_fail(job, start, ret);
-        if (error_is_read) {
-            *error_is_read = true;
+
+    remaining_bytes = nbytes;
+    while (remaining_bytes) {
+        int chunk = MIN(BACKUP_MAX_BOUNCE_BUFFER, remaining_bytes);
+
+        ret = blk_co_pread(blk, start, chunk, bounce_buffer, read_flags);
+        if (ret < 0) {
+            trace_backup_do_cow_read_fail(job, start, ret);
+            if (error_is_read) {
+                *error_is_read = true;
+            }
+            goto fail;
         }
-        goto fail;
-    }
 
-    ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
-                        job->write_flags);
-    if (ret < 0) {
-        trace_backup_do_cow_write_fail(job, start, ret);
-        if (error_is_read) {
-            *error_is_read = false;
+        ret = blk_co_pwrite(job->target, start, chunk, bounce_buffer,
+                            job->write_flags);
+        if (ret < 0) {
+            trace_backup_do_cow_write_fail(job, start, ret);
+            if (error_is_read) {
+                *error_is_read = false;
+            }
+            goto fail;
         }
-        goto fail;
+
+        start += chunk;
+        remaining_bytes -= chunk;
     }
 
     return nbytes;
@@ -301,9 +313,14 @@  static int coroutine_fn backup_do_cow(BackupBlockJob *job,
             }
         }
         if (!job->use_copy_range) {
+            if (!bounce_buffer) {
+                size_t len = MIN(BACKUP_MAX_BOUNCE_BUFFER,
+                                 MAX(dirty_end - start, end - dirty_end));
+                bounce_buffer = blk_try_blockalign(job->common.blk, len);
+            }
             ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
                                                 is_write_notifier,
-                                                error_is_read, &bounce_buffer);
+                                                error_is_read, bounce_buffer);
         }
         if (ret < 0) {
             break;