diff mbox series

[v10,8/9] qcow2: skip writing zero buffers to empty COW areas

Message ID 20181203101429.88735-9-anton.nefedov@virtuozzo.com
State New
Headers show
Series qcow2: cluster space preallocation | expand

Commit Message

Anton Nefedov Dec. 3, 2018, 10:14 a.m. UTC
If COW areas of the newly allocated clusters are zeroes on the backing image,
efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
cluster instead of writing explicit zero buffers later in perform_cow().

iotest 060:
write to the discarded cluster does not trigger COW anymore.
Use a backing image instead.

Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
---
 qapi/block-core.json       |  4 +-
 block/qcow2.h              |  6 +++
 block/qcow2-cluster.c      |  2 +-
 block/qcow2.c              | 80 +++++++++++++++++++++++++++++++++++++-
 block/trace-events         |  1 +
 tests/qemu-iotests/060     | 26 ++++++++-----
 tests/qemu-iotests/060.out |  5 ++-
 7 files changed, 109 insertions(+), 15 deletions(-)

Comments

Alberto Garcia Dec. 3, 2018, 1:59 p.m. UTC | #1
On Mon 03 Dec 2018 11:14:59 AM CET, Anton Nefedov wrote:
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -3004,6 +3004,8 @@
>  #
>  # @cor_write: a write due to copy-on-read (since 2.11)
>  #
> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)

Since 4.0 ??

Berto
Anton Nefedov Dec. 3, 2018, 2:04 p.m. UTC | #2
On 3/12/2018 4:59 PM, Alberto Garcia wrote:
> On Mon 03 Dec 2018 11:14:59 AM CET, Anton Nefedov wrote:
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -3004,6 +3004,8 @@
>>   #
>>   # @cor_write: a write due to copy-on-read (since 2.11)
>>   #
>> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
> 
> Since 4.0 ??
> 
> Berto
> 

I heard that 3.1 will be followed by 4.0

/Anton
Vladimir Sementsov-Ogievskiy Dec. 5, 2018, 2:01 p.m. UTC | #3
03.12.2018 13:14, Anton Nefedov wrote:
> If COW areas of the newly allocated clusters are zeroes on the backing image,
> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
> cluster instead of writing explicit zero buffers later in perform_cow().
> 
> iotest 060:
> write to the discarded cluster does not trigger COW anymore.
> Use a backing image instead.
> 
> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
> Reviewed-by: Alberto Garcia <berto@igalia.com>
> ---
>   qapi/block-core.json       |  4 +-
>   block/qcow2.h              |  6 +++
>   block/qcow2-cluster.c      |  2 +-
>   block/qcow2.c              | 80 +++++++++++++++++++++++++++++++++++++-
>   block/trace-events         |  1 +
>   tests/qemu-iotests/060     | 26 ++++++++-----
>   tests/qemu-iotests/060.out |  5 ++-
>   7 files changed, 109 insertions(+), 15 deletions(-)
> 
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index d4fe710836..50598aa8fe 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -3004,6 +3004,8 @@
>   #
>   # @cor_write: a write due to copy-on-read (since 2.11)
>   #
> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
> +#
>   # Since: 2.9
>   ##
>   { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
> @@ -3022,7 +3024,7 @@
>               'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>               'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>               'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
> -            'cor_write'] }
> +            'cor_write', 'cluster_alloc_space'] }
>   
>   ##
>   # @BlkdebugInjectErrorOptions:
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 8662b68575..8a64077897 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -389,6 +389,12 @@ typedef struct QCowL2Meta
>        */
>       Qcow2COWRegion cow_end;
>   
> +    /**
> +     * Indicates that COW regions are already handled and do not require
> +     * any more processing.
> +     */
> +    bool skip_cow;
> +
>       /**
>        * The I/O vector with the data from the actual guest write request.
>        * If non-NULL, this is meant to be merged together with the data
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index d37fe08b3d..3685c5f67e 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -806,7 +806,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
>       assert(start->offset + start->nb_bytes <= end->offset);
>       assert(!m->data_qiov || m->data_qiov->size == data_bytes);
>   
> -    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
> +    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
>           return 0;
>       }
>   
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 991d6ac91b..027188a1a3 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2015,6 +2015,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>               continue;
>           }
>   
> +        /* If COW regions are handled already, skip this too */
> +        if (m->skip_cow) {
> +            continue;
> +        }
> +
>           /* The data (middle) region must be immediately after the
>            * start region */
>           if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
> @@ -2040,6 +2045,68 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>       return false;
>   }
>   
> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
> +{
> +    int64_t nr;
> +    return !bytes ||
> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);

bdrv_is_allocated_above may return error < 0

> +}
> +
> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
> +{
> +    /* This check is designed for optimization shortcut so it must be
> +     * efficient.
> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
> +     * as accurate and can result in false negatives). */
> +    return is_unallocated(bs, m->offset + m->cow_start.offset,
> +                          m->cow_start.nb_bytes) &&
> +           is_unallocated(bs, m->offset + m->cow_end.offset,
> +                          m->cow_end.nb_bytes);
> +}
> +
> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    QCowL2Meta *m;
> +
> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
> +        return 0;
> +    }
> +
> +    if (bs->encrypted) {
> +        return 0;
> +    }
> +
> +    for (m = l2meta; m != NULL; m = m->next) {
> +        int ret;
> +
> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
> +            continue;
> +        }
> +
> +        if (!is_zero_cow(bs, m)) {
> +            continue;
> +        }
> +
> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
> +        /* instead of writing zero COW buffers,
> +           efficiently zero out the whole clusters */
> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
> +                                    m->nb_clusters * s->cluster_size,
> +                                    BDRV_REQ_ALLOCATE);
> +        if (ret < 0) {
> +            if (ret != -ENOTSUP && ret != -EAGAIN) {

why do you ignore ENOTSUP?

And also, hmm, should not we have problems with serializing? If already started unaligned request,
it is serializing.. And now, we are going to start nested serializing request, which will definitely
return EAGAIN..

> +                return ret;
> +            }
> +            continue;
> +        }
> +
> +        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
> +        m->skip_cow = true;
> +    }
> +    return 0;
> +}
> +
>   static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
>                                            uint64_t bytes, QEMUIOVector *qiov,
>                                            int flags)
> @@ -2122,24 +2189,33 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
>               goto fail;
>           }
>   
> +        qemu_co_mutex_unlock(&s->lock);
> +
> +        ret = handle_alloc_space(bs, l2meta);
> +        if (ret < 0) {
> +            qemu_co_mutex_lock(&s->lock);
> +            goto fail;
> +        }
> +
>           /* If we need to do COW, check if it's possible to merge the
>            * writing of the guest data together with that of the COW regions.
>            * If it's not possible (or not necessary) then write the
>            * guest data now. */
>           if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
> -            qemu_co_mutex_unlock(&s->lock);
>               BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
>               trace_qcow2_writev_data(qemu_coroutine_self(),
>                                       cluster_offset + offset_in_cluster);
>               ret = bdrv_co_pwritev(bs->file,
>                                     cluster_offset + offset_in_cluster,
>                                     cur_bytes, &hd_qiov, 0);
> -            qemu_co_mutex_lock(&s->lock);
>               if (ret < 0) {
> +                qemu_co_mutex_lock(&s->lock);
>                   goto fail;
>               }
>           }
>   
> +        qemu_co_mutex_lock(&s->lock);
> +
>           ret = qcow2_handle_l2meta(bs, &l2meta, true);
>           if (ret) {
>               goto fail;
> diff --git a/block/trace-events b/block/trace-events
> index 3e8c47bb24..f3f6d66dcc 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -69,6 +69,7 @@ qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
>   qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64
>   qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d"
>   qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d"
> +qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d"
>   
>   # block/qcow2-cluster.c
>   qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
> diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060
> index af0588ae9a..089df94657 100755
> --- a/tests/qemu-iotests/060
> +++ b/tests/qemu-iotests/060
> @@ -150,27 +150,33 @@ $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | _filter_qemu_io
>   echo
>   echo "=== Testing overlap while COW is in flight ==="
>   echo
> +BACKING_IMG=$TEST_IMG.base
> +TEST_IMG=$BACKING_IMG _make_test_img 1G
> +
> +$QEMU_IO -c 'write 64k 64k' "$BACKING_IMG" | _filter_qemu_io
> +
>   # compat=0.10 is required in order to make the following discard actually
> -# unallocate the sector rather than make it a zero sector - we want COW, after
> -# all.
> -IMGOPTS='compat=0.10' _make_test_img 1G
> +# unallocate the sector rather than make it a zero sector as we would like
> +# to reuse it for another guest offset
> +IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G
>   # Write two clusters, the second one enforces creation of an L2 table after
>   # the first data cluster.
>   $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io
> -# Discard the first cluster. This cluster will soon enough be reallocated and
> -# used for COW.
> +# Discard the first cluster. This cluster will soon enough be reallocated
>   $QEMU_IO -c 'discard 0k 64k' "$TEST_IMG" | _filter_qemu_io
>   # Now, corrupt the image by marking the second L2 table cluster as free.
>   poke_file "$TEST_IMG" '131084' "\x00\x00" # 0x2000c
> -# Start a write operation requiring COW on the image stopping it right before
> -# doing the read; then, trigger the corruption prevention by writing anything to
> -# any unallocated cluster, leading to an attempt to overwrite the second L2
> +# Start a write operation requiring COW on the image;
> +# this write will reuse the host offset released by a previous discard.
> +# Stop it right before doing the read.
> +# Then, trigger the corruption prevention by writing anything to
> +# another unallocated cluster, leading to an attempt to overwrite the second L2
>   # table. Finally, resume the COW write and see it fail (but not crash).
>   echo "open -o file.driver=blkdebug $TEST_IMG
>   break cow_read 0
> -aio_write 0k 1k
> +aio_write 64k 1k
>   wait_break 0
> -write 64k 64k
> +write 128k 64k
>   resume 0" | $QEMU_IO | _filter_qemu_io
>   
>   echo
> diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
> index d67c6234a4..ec8f15d2f0 100644
> --- a/tests/qemu-iotests/060.out
> +++ b/tests/qemu-iotests/060.out
> @@ -97,7 +97,10 @@ read 512/512 bytes at offset 0
>   
>   === Testing overlap while COW is in flight ===
>   
> -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
> +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1073741824
> +wrote 65536/65536 bytes at offset 65536
> +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 backing_file=TEST_DIR/t.IMGFMT.base
>   wrote 65536/65536 bytes at offset 0
>   64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>   wrote 65536/65536 bytes at offset 536870912
>
Anton Nefedov Dec. 5, 2018, 4:59 p.m. UTC | #4
On 5/12/2018 5:01 PM, Vladimir Sementsov-Ogievskiy wrote:
> 03.12.2018 13:14, Anton Nefedov wrote:
>> If COW areas of the newly allocated clusters are zeroes on the backing image,
>> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
>> cluster instead of writing explicit zero buffers later in perform_cow().
>>
>> iotest 060:
>> write to the discarded cluster does not trigger COW anymore.
>> Use a backing image instead.
>>
>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>> Reviewed-by: Alberto Garcia <berto@igalia.com>
>> ---
>>    qapi/block-core.json       |  4 +-
>>    block/qcow2.h              |  6 +++
>>    block/qcow2-cluster.c      |  2 +-
>>    block/qcow2.c              | 80 +++++++++++++++++++++++++++++++++++++-
>>    block/trace-events         |  1 +
>>    tests/qemu-iotests/060     | 26 ++++++++-----
>>    tests/qemu-iotests/060.out |  5 ++-
>>    7 files changed, 109 insertions(+), 15 deletions(-)
>>
>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>> index d4fe710836..50598aa8fe 100644
>> --- a/qapi/block-core.json
>> +++ b/qapi/block-core.json
>> @@ -3004,6 +3004,8 @@
>>    #
>>    # @cor_write: a write due to copy-on-read (since 2.11)
>>    #
>> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
>> +#
>>    # Since: 2.9
>>    ##
>>    { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
>> @@ -3022,7 +3024,7 @@
>>                'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>>                'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>>                'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
>> -            'cor_write'] }
>> +            'cor_write', 'cluster_alloc_space'] }
>>    
>>    ##
>>    # @BlkdebugInjectErrorOptions:
>> diff --git a/block/qcow2.h b/block/qcow2.h
>> index 8662b68575..8a64077897 100644
>> --- a/block/qcow2.h
>> +++ b/block/qcow2.h
>> @@ -389,6 +389,12 @@ typedef struct QCowL2Meta
>>         */
>>        Qcow2COWRegion cow_end;
>>    
>> +    /**
>> +     * Indicates that COW regions are already handled and do not require
>> +     * any more processing.
>> +     */
>> +    bool skip_cow;
>> +
>>        /**
>>         * The I/O vector with the data from the actual guest write request.
>>         * If non-NULL, this is meant to be merged together with the data
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index d37fe08b3d..3685c5f67e 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -806,7 +806,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
>>        assert(start->offset + start->nb_bytes <= end->offset);
>>        assert(!m->data_qiov || m->data_qiov->size == data_bytes);
>>    
>> -    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
>> +    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
>>            return 0;
>>        }
>>    
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 991d6ac91b..027188a1a3 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -2015,6 +2015,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>                continue;
>>            }
>>    
>> +        /* If COW regions are handled already, skip this too */
>> +        if (m->skip_cow) {
>> +            continue;
>> +        }
>> +
>>            /* The data (middle) region must be immediately after the
>>             * start region */
>>            if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
>> @@ -2040,6 +2045,68 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>        return false;
>>    }
>>    
>> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
>> +{
>> +    int64_t nr;
>> +    return !bytes ||
>> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
> 
> bdrv_is_allocated_above may return error < 0
> 

Probably I just took is_zero() as an example.
But somewhere there's even a rationale (bdrv_co_do_copy_on_readv):

         ret = bdrv_is_allocated(bs, cluster_offset,
                                 MIN(cluster_bytes, max_transfer), &pnum);
         if (ret < 0) {
             /* Safe to treat errors in querying allocation as if
              * unallocated; we'll probably fail again soon on the
              * read, but at least that will set a decent errno.
              */
             pnum = MIN(cluster_bytes, max_transfer);
         }

>> +}
>> +
>> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
>> +{
>> +    /* This check is designed for optimization shortcut so it must be
>> +     * efficient.
>> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
>> +     * as accurate and can result in false negatives). */
>> +    return is_unallocated(bs, m->offset + m->cow_start.offset,
>> +                          m->cow_start.nb_bytes) &&
>> +           is_unallocated(bs, m->offset + m->cow_end.offset,
>> +                          m->cow_end.nb_bytes);
>> +}
>> +
>> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    QCowL2Meta *m;
>> +
>> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
>> +        return 0;
>> +    }
>> +
>> +    if (bs->encrypted) {
>> +        return 0;
>> +    }
>> +
>> +    for (m = l2meta; m != NULL; m = m->next) {
>> +        int ret;
>> +
>> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
>> +            continue;
>> +        }
>> +
>> +        if (!is_zero_cow(bs, m)) {
>> +            continue;
>> +        }
>> +
>> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>> +        /* instead of writing zero COW buffers,
>> +           efficiently zero out the whole clusters */
>> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
>> +                                    m->nb_clusters * s->cluster_size,
>> +                                    BDRV_REQ_ALLOCATE);
>> +        if (ret < 0) {
>> +            if (ret != -ENOTSUP && ret != -EAGAIN) {
> 
> why do you ignore ENOTSUP?
> 

This might particularly happen after first pwrite_zeroes() to
file-posix.
It generally reports BDRV_REQ_ALLOCATE on open, but then it might reset
the flag if fallocate() returns ENOTSUP.

Nevermind file-posix; with any file driver what can we possibly do here:
this operation is non-mandatory and the error is not critical.
Never trying again doesn't look quite right, the file driver can reset
BDRV_REQ_ALLOCATE if it likes and we won't bother it anymore.


> And also, hmm, should not we have problems with serializing? If already started unaligned request,
> it is serializing.. And now, we are going to start nested serializing request, which will definitely
> return EAGAIN..
> 

The request we're processing here is on qcow2 level yet, it won't affect
the bs->file serialization queue.

[..]
Vladimir Sementsov-Ogievskiy Dec. 5, 2018, 5:42 p.m. UTC | #5
05.12.2018 19:59, Anton Nefedov wrote:
> On 5/12/2018 5:01 PM, Vladimir Sementsov-Ogievskiy wrote:
>> 03.12.2018 13:14, Anton Nefedov wrote:
>>> If COW areas of the newly allocated clusters are zeroes on the backing image,
>>> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
>>> cluster instead of writing explicit zero buffers later in perform_cow().
>>>
>>> iotest 060:
>>> write to the discarded cluster does not trigger COW anymore.
>>> Use a backing image instead.
>>>
>>> Signed-off-by: Anton Nefedov <anton.nefedov@virtuozzo.com>
>>> Reviewed-by: Alberto Garcia <berto@igalia.com>
>>> ---
>>>     qapi/block-core.json       |  4 +-
>>>     block/qcow2.h              |  6 +++
>>>     block/qcow2-cluster.c      |  2 +-
>>>     block/qcow2.c              | 80 +++++++++++++++++++++++++++++++++++++-
>>>     block/trace-events         |  1 +
>>>     tests/qemu-iotests/060     | 26 ++++++++-----
>>>     tests/qemu-iotests/060.out |  5 ++-
>>>     7 files changed, 109 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/qapi/block-core.json b/qapi/block-core.json
>>> index d4fe710836..50598aa8fe 100644
>>> --- a/qapi/block-core.json
>>> +++ b/qapi/block-core.json
>>> @@ -3004,6 +3004,8 @@
>>>     #
>>>     # @cor_write: a write due to copy-on-read (since 2.11)
>>>     #
>>> +# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
>>> +#
>>>     # Since: 2.9
>>>     ##
>>>     { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
>>> @@ -3022,7 +3024,7 @@
>>>                 'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
>>>                 'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
>>>                 'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
>>> -            'cor_write'] }
>>> +            'cor_write', 'cluster_alloc_space'] }
>>>     
>>>     ##
>>>     # @BlkdebugInjectErrorOptions:
>>> diff --git a/block/qcow2.h b/block/qcow2.h
>>> index 8662b68575..8a64077897 100644
>>> --- a/block/qcow2.h
>>> +++ b/block/qcow2.h
>>> @@ -389,6 +389,12 @@ typedef struct QCowL2Meta
>>>          */
>>>         Qcow2COWRegion cow_end;
>>>     
>>> +    /**
>>> +     * Indicates that COW regions are already handled and do not require
>>> +     * any more processing.
>>> +     */
>>> +    bool skip_cow;
>>> +
>>>         /**
>>>          * The I/O vector with the data from the actual guest write request.
>>>          * If non-NULL, this is meant to be merged together with the data
>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>> index d37fe08b3d..3685c5f67e 100644
>>> --- a/block/qcow2-cluster.c
>>> +++ b/block/qcow2-cluster.c
>>> @@ -806,7 +806,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
>>>         assert(start->offset + start->nb_bytes <= end->offset);
>>>         assert(!m->data_qiov || m->data_qiov->size == data_bytes);
>>>     
>>> -    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
>>> +    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
>>>             return 0;
>>>         }
>>>     
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index 991d6ac91b..027188a1a3 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>>> @@ -2015,6 +2015,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>>                 continue;
>>>             }
>>>     
>>> +        /* If COW regions are handled already, skip this too */
>>> +        if (m->skip_cow) {
>>> +            continue;
>>> +        }
>>> +
>>>             /* The data (middle) region must be immediately after the
>>>              * start region */
>>>             if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
>>> @@ -2040,6 +2045,68 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>>         return false;
>>>     }
>>>     
>>> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
>>> +{
>>> +    int64_t nr;
>>> +    return !bytes ||
>>> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
>>
>> bdrv_is_allocated_above may return error < 0
>>
> 
> Probably I just took is_zero() as an example.
> But somewhere there's even a rationale (bdrv_co_do_copy_on_readv):
> 
>           ret = bdrv_is_allocated(bs, cluster_offset,
>                                   MIN(cluster_bytes, max_transfer), &pnum);
>           if (ret < 0) {
>               /* Safe to treat errors in querying allocation as if
>                * unallocated; we'll probably fail again soon on the
>                * read, but at least that will set a decent errno.
>                */
>               pnum = MIN(cluster_bytes, max_transfer);
>           }

aha, anyway, !bdrv_is_allocated_above is true when and only when the function
successfully returned "unallocated".

ahaha, this rationale has funny mistake: s/unallocated/allocated )
Vladimir Sementsov-Ogievskiy Dec. 13, 2018, 12:02 p.m. UTC | #6
03.12.2018 13:14, Anton Nefedov wrote:
> If COW areas of the newly allocated clusters are zeroes on the backing image,
> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
> cluster instead of writing explicit zero buffers later in perform_cow().
> 

[...]

> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2015,6 +2015,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>               continue;
>           }
>   
> +        /* If COW regions are handled already, skip this too */
> +        if (m->skip_cow) {
> +            continue;
> +        }
> +
>           /* The data (middle) region must be immediately after the
>            * start region */
>           if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
> @@ -2040,6 +2045,68 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>       return false;
>   }
>   
> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
> +{
> +    int64_t nr;
> +    return !bytes ||
> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);

hm, nr may be < bytes if it is up to file length. And we lose this case, when, it
may be considered as unallocated too.

Doesn't harm, however.

> +}
> +
> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
> +{
> +    /* This check is designed for optimization shortcut so it must be
> +     * efficient.
> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
> +     * as accurate and can result in false negatives). */

But, in case of allocated zeros, we'll read them anyway (as part of COW process),
so, it may be handled in the same way too. May be not here, but after read we can
check for zeroes, and again effectively write zeros to the whole cluster.

Again it may be done separately, I don't sure it worth doing.

> +    return is_unallocated(bs, m->offset + m->cow_start.offset,
> +                          m->cow_start.nb_bytes) &&
> +           is_unallocated(bs, m->offset + m->cow_end.offset,
> +                          m->cow_end.nb_bytes);
> +}
> +
> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    QCowL2Meta *m;
> +
> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
> +        return 0;
> +    }
> +
> +    if (bs->encrypted) {
> +        return 0;
> +    }
> +
> +    for (m = l2meta; m != NULL; m = m->next) {
> +        int ret;
> +
> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
> +            continue;
> +        }
> +
> +        if (!is_zero_cow(bs, m)) {
> +            continue;
> +        }

pre_write_overlap_check should be here

> +
> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
> +        /* instead of writing zero COW buffers,
> +           efficiently zero out the whole clusters */
> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
> +                                    m->nb_clusters * s->cluster_size,
> +                                    BDRV_REQ_ALLOCATE);
> +        if (ret < 0) {
> +            if (ret != -ENOTSUP && ret != -EAGAIN) {
> +                return ret;
> +            }
> +            continue;
> +        }
> +
> +        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
> +        m->skip_cow = true;
> +    }
> +    return 0;
> +}
> +
Anton Nefedov Dec. 13, 2018, 1:57 p.m. UTC | #7
On 13/12/2018 3:02 PM, Vladimir Sementsov-Ogievskiy wrote:
> 03.12.2018 13:14, Anton Nefedov wrote:
>> If COW areas of the newly allocated clusters are zeroes on the backing image,
>> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
>> cluster instead of writing explicit zero buffers later in perform_cow().
>>
> 
> [...]
> 
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -2015,6 +2015,11 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>                continue;
>>            }
>>    
>> +        /* If COW regions are handled already, skip this too */
>> +        if (m->skip_cow) {
>> +            continue;
>> +        }
>> +
>>            /* The data (middle) region must be immediately after the
>>             * start region */
>>            if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
>> @@ -2040,6 +2045,68 @@ static bool merge_cow(uint64_t offset, unsigned bytes,
>>        return false;
>>    }
>>    
>> +static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
>> +{
>> +    int64_t nr;
>> +    return !bytes ||
>> +        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
> 
> hm, nr may be < bytes if it is up to file length. And we lose this case, when, it
> may be considered as unallocated too.
> 
> Doesn't harm, however.
> 

Thanks guys for your review.

This case I think is too rare to care about.

>> +}
>> +
>> +static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
>> +{
>> +    /* This check is designed for optimization shortcut so it must be
>> +     * efficient.
>> +     * Instead of is_zero(), use is_unallocated() as it is faster (but not
>> +     * as accurate and can result in false negatives). */
> 
> But, in case of allocated zeros, we'll read them anyway (as part of COW process),
> so, it may be handled in the same way too. May be not here, but after read we can
> check for zeroes, and again effectively write zeros to the whole cluster.
> 
> Again it may be done separately, I don't sure it worth doing.
> 

Detecting zeroes after we read them (and not here) might make sense;
performance gain should be about the same (minus some CPU to check the
read buffer for zeroes).

The question is how frequent it might hit:
  - raw backing image with holes
  - qcow2 backing image with sub-cluster zero areas
    (e.g. smaller cluster size)
  - backing image contains lots of space with explicit zeroes
    (e.g. guest FS with 'shred' extensively used to delete files)

None of these probably occur that frequent.
But might be a next step and a separate series.

>> +    return is_unallocated(bs, m->offset + m->cow_start.offset,
>> +                          m->cow_start.nb_bytes) &&
>> +           is_unallocated(bs, m->offset + m->cow_end.offset,
>> +                          m->cow_end.nb_bytes);
>> +}
>> +
>> +static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    QCowL2Meta *m;
>> +
>> +    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
>> +        return 0;
>> +    }
>> +
>> +    if (bs->encrypted) {
>> +        return 0;
>> +    }
>> +
>> +    for (m = l2meta; m != NULL; m = m->next) {
>> +        int ret;
>> +
>> +        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
>> +            continue;
>> +        }
>> +
>> +        if (!is_zero_cow(bs, m)) {
>> +            continue;
>> +        }
> 
> pre_write_overlap_check should be here
> 

Existing pre_write_overlap_check in qcow2_co_pwritev() should cover it,
since the check aligns the range to cluster boundaries.

However I think I missed a comment about it here. For suspicious
readers :) and just in case someone starts moving this code around.

I propose:

diff --git a/block/qcow2.c b/block/qcow2.c
index 027188a1a3..b3b3124083 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2088,6 +2088,9 @@ static int handle_alloc_space(BlockDriverState 
*bs, QCowL2Meta *l2meta)
              continue;
          }

+        /* Conventional place for qcow2_pre_write_overlap_check() but 
in this
+           case it is already done for these clusters */
+
          BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
          /* instead of writing zero COW buffers,
             efficiently zero out the whole clusters */


>> +
>> +        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
>> +        /* instead of writing zero COW buffers,
>> +           efficiently zero out the whole clusters */
>> +        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
>> +                                    m->nb_clusters * s->cluster_size,
>> +                                    BDRV_REQ_ALLOCATE);
>> +        if (ret < 0) {
>> +            if (ret != -ENOTSUP && ret != -EAGAIN) {
>> +                return ret;
>> +            }
>> +            continue;
>> +        }
>> +
>> +        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
>> +        m->skip_cow = true;
>> +    }
>> +    return 0;
>> +}
>> +
> 
>
Vladimir Sementsov-Ogievskiy Dec. 14, 2018, 4:20 p.m. UTC | #8
03.12.2018 13:14, Anton Nefedov wrote:
> If COW areas of the newly allocated clusters are zeroes on the backing image,
> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
> cluster instead of writing explicit zero buffers later in perform_cow().
> 
> iotest 060:
> write to the discarded cluster does not trigger COW anymore.
> Use a backing image instead.
> 

[..]

> --- a/tests/qemu-iotests/060
> +++ b/tests/qemu-iotests/060
> @@ -150,27 +150,33 @@ $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | _filter_qemu_io
>   echo
>   echo "=== Testing overlap while COW is in flight ==="
>   echo
> +BACKING_IMG=$TEST_IMG.base
> +TEST_IMG=$BACKING_IMG _make_test_img 1G
> +
> +$QEMU_IO -c 'write 64k 64k' "$BACKING_IMG" | _filter_qemu_io
> +
>   # compat=0.10 is required in order to make the following discard actually
> -# unallocate the sector rather than make it a zero sector - we want COW, after
> -# all.
> -IMGOPTS='compat=0.10' _make_test_img 1G
> +# unallocate the sector rather than make it a zero sector as we would like
> +# to reuse it for another guest offset
> +IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G
>   # Write two clusters, the second one enforces creation of an L2 table after
>   # the first data cluster.
>   $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io
> -# Discard the first cluster. This cluster will soon enough be reallocated and
> -# used for COW.
> +# Discard the first cluster. This cluster will soon enough be reallocated
>   $QEMU_IO -c 'discard 0k 64k' "$TEST_IMG" | _filter_qemu_io
>   # Now, corrupt the image by marking the second L2 table cluster as free.
>   poke_file "$TEST_IMG" '131084' "\x00\x00" # 0x2000c
> -# Start a write operation requiring COW on the image stopping it right before
> -# doing the read; then, trigger the corruption prevention by writing anything to
> -# any unallocated cluster, leading to an attempt to overwrite the second L2
> +# Start a write operation requiring COW on the image;
> +# this write will reuse the host offset released by a previous discard.
> +# Stop it right before doing the read.
> +# Then, trigger the corruption prevention by writing anything to
> +# another unallocated cluster, leading to an attempt to overwrite the second L2
>   # table. Finally, resume the COW write and see it fail (but not crash).
>   echo "open -o file.driver=blkdebug $TEST_IMG
>   break cow_read 0
> -aio_write 0k 1k
> +aio_write 64k 1k
>   wait_break 0
> -write 64k 64k
> +write 128k 64k

don't understand why you need these changes.

works for me, without them, if write to backing at 0 offset, of course.

As I understand, discard create unallocated holes in top qcow2 for old qcow2 version.

>   resume 0" | $QEMU_IO | _filter_qemu_io
>   
>   echo
> diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
> index d67c6234a4..ec8f15d2f0 100644
> --- a/tests/qemu-iotests/060.out
> +++ b/tests/qemu-iotests/060.out
> @@ -97,7 +97,10 @@ read 512/512 bytes at offset 0
>   
>   === Testing overlap while COW is in flight ===
>   
> -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
> +Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1073741824
> +wrote 65536/65536 bytes at offset 65536
> +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
> +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 backing_file=TEST_DIR/t.IMGFMT.base
>   wrote 65536/65536 bytes at offset 0
>   64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>   wrote 65536/65536 bytes at offset 536870912
>
Anton Nefedov Dec. 17, 2018, 10:17 a.m. UTC | #9
On 14/12/2018 7:20 PM, Vladimir Sementsov-Ogievskiy wrote:
> 03.12.2018 13:14, Anton Nefedov wrote:
>> If COW areas of the newly allocated clusters are zeroes on the backing image,
>> efficient bdrv_write_zeroes(flags=BDRV_REQ_ALLOCATE) can be used on the whole
>> cluster instead of writing explicit zero buffers later in perform_cow().
>>
>> iotest 060:
>> write to the discarded cluster does not trigger COW anymore.
>> Use a backing image instead.
>>
> 
> [..]
> 
>> --- a/tests/qemu-iotests/060
>> +++ b/tests/qemu-iotests/060
>> @@ -150,27 +150,33 @@ $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | _filter_qemu_io
>>    echo
>>    echo "=== Testing overlap while COW is in flight ==="
>>    echo
>> +BACKING_IMG=$TEST_IMG.base
>> +TEST_IMG=$BACKING_IMG _make_test_img 1G
>> +
>> +$QEMU_IO -c 'write 64k 64k' "$BACKING_IMG" | _filter_qemu_io
>> +
>>    # compat=0.10 is required in order to make the following discard actually
>> -# unallocate the sector rather than make it a zero sector - we want COW, after
>> -# all.
>> -IMGOPTS='compat=0.10' _make_test_img 1G
>> +# unallocate the sector rather than make it a zero sector as we would like
>> +# to reuse it for another guest offset
>> +IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G
>>    # Write two clusters, the second one enforces creation of an L2 table after
>>    # the first data cluster.
>>    $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io
>> -# Discard the first cluster. This cluster will soon enough be reallocated and
>> -# used for COW.
>> +# Discard the first cluster. This cluster will soon enough be reallocated
>>    $QEMU_IO -c 'discard 0k 64k' "$TEST_IMG" | _filter_qemu_io
>>    # Now, corrupt the image by marking the second L2 table cluster as free.
>>    poke_file "$TEST_IMG" '131084' "\x00\x00" # 0x2000c
>> -# Start a write operation requiring COW on the image stopping it right before
>> -# doing the read; then, trigger the corruption prevention by writing anything to
>> -# any unallocated cluster, leading to an attempt to overwrite the second L2
>> +# Start a write operation requiring COW on the image;
>> +# this write will reuse the host offset released by a previous discard.
>> +# Stop it right before doing the read.
>> +# Then, trigger the corruption prevention by writing anything to
>> +# another unallocated cluster, leading to an attempt to overwrite the second L2
>>    # table. Finally, resume the COW write and see it fail (but not crash).
>>    echo "open -o file.driver=blkdebug $TEST_IMG
>>    break cow_read 0
>> -aio_write 0k 1k
>> +aio_write 64k 1k
>>    wait_break 0
>> -write 64k 64k
>> +write 128k 64k
> 
> don't understand why you need these changes.
> 
> works for me, without them, if write to backing at 0 offset, of course.
> 
> As I understand, discard create unallocated holes in top qcow2 for old qcow2 version.
> 

Ok, so COW happens regardless if this guest offset has been discarded
before. These offset changes are indeed not needed. Just the backing
file.
diff mbox series

Patch

diff --git a/qapi/block-core.json b/qapi/block-core.json
index d4fe710836..50598aa8fe 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3004,6 +3004,8 @@ 
 #
 # @cor_write: a write due to copy-on-read (since 2.11)
 #
+# @cluster_alloc_space: an allocation of file space for a cluster (since 4.0)
+#
 # Since: 2.9
 ##
 { 'enum': 'BlkdebugEvent', 'prefix': 'BLKDBG',
@@ -3022,7 +3024,7 @@ 
             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
             'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
             'l1_shrink_write_table', 'l1_shrink_free_l2_clusters',
-            'cor_write'] }
+            'cor_write', 'cluster_alloc_space'] }
 
 ##
 # @BlkdebugInjectErrorOptions:
diff --git a/block/qcow2.h b/block/qcow2.h
index 8662b68575..8a64077897 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -389,6 +389,12 @@  typedef struct QCowL2Meta
      */
     Qcow2COWRegion cow_end;
 
+    /**
+     * Indicates that COW regions are already handled and do not require
+     * any more processing.
+     */
+    bool skip_cow;
+
     /**
      * The I/O vector with the data from the actual guest write request.
      * If non-NULL, this is meant to be merged together with the data
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d37fe08b3d..3685c5f67e 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -806,7 +806,7 @@  static int perform_cow(BlockDriverState *bs, QCowL2Meta *m)
     assert(start->offset + start->nb_bytes <= end->offset);
     assert(!m->data_qiov || m->data_qiov->size == data_bytes);
 
-    if (start->nb_bytes == 0 && end->nb_bytes == 0) {
+    if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) {
         return 0;
     }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 991d6ac91b..027188a1a3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2015,6 +2015,11 @@  static bool merge_cow(uint64_t offset, unsigned bytes,
             continue;
         }
 
+        /* If COW regions are handled already, skip this too */
+        if (m->skip_cow) {
+            continue;
+        }
+
         /* The data (middle) region must be immediately after the
          * start region */
         if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
@@ -2040,6 +2045,68 @@  static bool merge_cow(uint64_t offset, unsigned bytes,
     return false;
 }
 
+static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+    int64_t nr;
+    return !bytes ||
+        (!bdrv_is_allocated_above(bs, NULL, offset, bytes, &nr) && nr == bytes);
+}
+
+static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
+{
+    /* This check is designed for optimization shortcut so it must be
+     * efficient.
+     * Instead of is_zero(), use is_unallocated() as it is faster (but not
+     * as accurate and can result in false negatives). */
+    return is_unallocated(bs, m->offset + m->cow_start.offset,
+                          m->cow_start.nb_bytes) &&
+           is_unallocated(bs, m->offset + m->cow_end.offset,
+                          m->cow_end.nb_bytes);
+}
+
+static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
+{
+    BDRVQcow2State *s = bs->opaque;
+    QCowL2Meta *m;
+
+    if (!(bs->file->bs->supported_zero_flags & BDRV_REQ_ALLOCATE)) {
+        return 0;
+    }
+
+    if (bs->encrypted) {
+        return 0;
+    }
+
+    for (m = l2meta; m != NULL; m = m->next) {
+        int ret;
+
+        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
+            continue;
+        }
+
+        if (!is_zero_cow(bs, m)) {
+            continue;
+        }
+
+        BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
+        /* instead of writing zero COW buffers,
+           efficiently zero out the whole clusters */
+        ret = bdrv_co_pwrite_zeroes(bs->file, m->alloc_offset,
+                                    m->nb_clusters * s->cluster_size,
+                                    BDRV_REQ_ALLOCATE);
+        if (ret < 0) {
+            if (ret != -ENOTSUP && ret != -EAGAIN) {
+                return ret;
+            }
+            continue;
+        }
+
+        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
+        m->skip_cow = true;
+    }
+    return 0;
+}
+
 static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                          uint64_t bytes, QEMUIOVector *qiov,
                                          int flags)
@@ -2122,24 +2189,33 @@  static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
             goto fail;
         }
 
+        qemu_co_mutex_unlock(&s->lock);
+
+        ret = handle_alloc_space(bs, l2meta);
+        if (ret < 0) {
+            qemu_co_mutex_lock(&s->lock);
+            goto fail;
+        }
+
         /* If we need to do COW, check if it's possible to merge the
          * writing of the guest data together with that of the COW regions.
          * If it's not possible (or not necessary) then write the
          * guest data now. */
         if (!merge_cow(offset, cur_bytes, &hd_qiov, l2meta)) {
-            qemu_co_mutex_unlock(&s->lock);
             BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
             trace_qcow2_writev_data(qemu_coroutine_self(),
                                     cluster_offset + offset_in_cluster);
             ret = bdrv_co_pwritev(bs->file,
                                   cluster_offset + offset_in_cluster,
                                   cur_bytes, &hd_qiov, 0);
-            qemu_co_mutex_lock(&s->lock);
             if (ret < 0) {
+                qemu_co_mutex_lock(&s->lock);
                 goto fail;
             }
         }
 
+        qemu_co_mutex_lock(&s->lock);
+
         ret = qcow2_handle_l2meta(bs, &l2meta, true);
         if (ret) {
             goto fail;
diff --git a/block/trace-events b/block/trace-events
index 3e8c47bb24..f3f6d66dcc 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -69,6 +69,7 @@  qcow2_writev_done_part(void *co, int cur_bytes) "co %p cur_bytes %d"
 qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64
 qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d"
 qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset 0x%" PRIx64 " count %d"
+qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset 0x%" PRIx64 " nb_clusters %d"
 
 # block/qcow2-cluster.c
 qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p offset 0x%" PRIx64 " bytes %d"
diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060
index af0588ae9a..089df94657 100755
--- a/tests/qemu-iotests/060
+++ b/tests/qemu-iotests/060
@@ -150,27 +150,33 @@  $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | _filter_qemu_io
 echo
 echo "=== Testing overlap while COW is in flight ==="
 echo
+BACKING_IMG=$TEST_IMG.base
+TEST_IMG=$BACKING_IMG _make_test_img 1G
+
+$QEMU_IO -c 'write 64k 64k' "$BACKING_IMG" | _filter_qemu_io
+
 # compat=0.10 is required in order to make the following discard actually
-# unallocate the sector rather than make it a zero sector - we want COW, after
-# all.
-IMGOPTS='compat=0.10' _make_test_img 1G
+# unallocate the sector rather than make it a zero sector as we would like
+# to reuse it for another guest offset
+IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G
 # Write two clusters, the second one enforces creation of an L2 table after
 # the first data cluster.
 $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io
-# Discard the first cluster. This cluster will soon enough be reallocated and
-# used for COW.
+# Discard the first cluster. This cluster will soon enough be reallocated
 $QEMU_IO -c 'discard 0k 64k' "$TEST_IMG" | _filter_qemu_io
 # Now, corrupt the image by marking the second L2 table cluster as free.
 poke_file "$TEST_IMG" '131084' "\x00\x00" # 0x2000c
-# Start a write operation requiring COW on the image stopping it right before
-# doing the read; then, trigger the corruption prevention by writing anything to
-# any unallocated cluster, leading to an attempt to overwrite the second L2
+# Start a write operation requiring COW on the image;
+# this write will reuse the host offset released by a previous discard.
+# Stop it right before doing the read.
+# Then, trigger the corruption prevention by writing anything to
+# another unallocated cluster, leading to an attempt to overwrite the second L2
 # table. Finally, resume the COW write and see it fail (but not crash).
 echo "open -o file.driver=blkdebug $TEST_IMG
 break cow_read 0
-aio_write 0k 1k
+aio_write 64k 1k
 wait_break 0
-write 64k 64k
+write 128k 64k
 resume 0" | $QEMU_IO | _filter_qemu_io
 
 echo
diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
index d67c6234a4..ec8f15d2f0 100644
--- a/tests/qemu-iotests/060.out
+++ b/tests/qemu-iotests/060.out
@@ -97,7 +97,10 @@  read 512/512 bytes at offset 0
 
 === Testing overlap while COW is in flight ===
 
-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1073741824
+wrote 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 backing_file=TEST_DIR/t.IMGFMT.base
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 65536/65536 bytes at offset 536870912