diff mbox

[v13,03/14] qcow2: Optimize bdrv_make_empty()

Message ID 1413982283-10186-4-git-send-email-mreitz@redhat.com
State New
Headers show

Commit Message

Max Reitz Oct. 22, 2014, 12:51 p.m. UTC
bdrv_make_empty() is currently only called if the current image
represents an external snapshot that has been committed to its base
image; it is therefore unlikely to have internal snapshots. In this
case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
refcount table (while having the dirty flag set, which only works for
compat=1.1) and creating a trivial refcount structure.

If there are snapshots or for compat=0.10, fall back to the simple
implementation (discard all clusters).

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/blkdebug.c      |   2 +
 block/qcow2.c         | 143 ++++++++++++++++++++++++++++++++++++++++++++------
 include/block/block.h |   2 +
 3 files changed, 132 insertions(+), 15 deletions(-)

Comments

Eric Blake Oct. 22, 2014, 4:04 p.m. UTC | #1
On 10/22/2014 06:51 AM, Max Reitz wrote:
> bdrv_make_empty() is currently only called if the current image
> represents an external snapshot that has been committed to its base
> image; it is therefore unlikely to have internal snapshots. In this
> case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
> refcount table (while having the dirty flag set, which only works for
> compat=1.1) and creating a trivial refcount structure.
> 
> If there are snapshots or for compat=0.10, fall back to the simple
> implementation (discard all clusters).
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/blkdebug.c      |   2 +
>  block/qcow2.c         | 143 ++++++++++++++++++++++++++++++++++++++++++++------
>  include/block/block.h |   2 +
>  3 files changed, 132 insertions(+), 15 deletions(-)

Reviewed-by: Eric Blake <eblake@redhat.com>
Kevin Wolf Oct. 22, 2014, 6:35 p.m. UTC | #2
Am 22.10.2014 um 14:51 hat Max Reitz geschrieben:
> bdrv_make_empty() is currently only called if the current image
> represents an external snapshot that has been committed to its base
> image; it is therefore unlikely to have internal snapshots. In this
> case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
> refcount table (while having the dirty flag set, which only works for
> compat=1.1) and creating a trivial refcount structure.
> 
> If there are snapshots or for compat=0.10, fall back to the simple
> implementation (discard all clusters).
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>

Hey, this feels actually reviewable this time. :-)

> diff --git a/block/blkdebug.c b/block/blkdebug.c
> index e046b92..862d93b 100644
> --- a/block/blkdebug.c
> +++ b/block/blkdebug.c
> @@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
>      [BLKDBG_PWRITEV]                        = "pwritev",
>      [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
>      [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
> +
> +    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
>  };
>  
>  static int get_event_by_name(const char *name, BlkDebugEvent *event)
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 1ef3a5f..16dece2 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2232,24 +2232,137 @@ fail:
>  
>  static int qcow2_make_empty(BlockDriverState *bs)
>  {
> +    BDRVQcowState *s = bs->opaque;
>      int ret = 0;
> -    uint64_t start_sector;
> -    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>  
> -    for (start_sector = 0; start_sector < bs->total_sectors;
> -         start_sector += sector_step)
> -    {
> -        /* As this function is generally used after committing an external
> -         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
> -         * default action for this kind of discard is to pass the discard,
> -         * which will ideally result in an actually smaller image file, as
> -         * is probably desired. */
> -        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
> -                                     MIN(sector_step,
> -                                         bs->total_sectors - start_sector),
> -                                     QCOW2_DISCARD_SNAPSHOT, true);
> +    if (s->snapshots || s->qcow_version < 3) {
> +        uint64_t start_sector;
> +        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
> +
> +        /* If there are snapshots, every active cluster has to be discarded; and
> +         * because compat=0.10 does not support setting the dirty flag, we have
> +         * to use this fallback there as well */
> +
> +        for (start_sector = 0; start_sector < bs->total_sectors;
> +             start_sector += sector_step)
> +        {
> +            /* As this function is generally used after committing an external
> +             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
> +             * default action for this kind of discard is to pass the discard,
> +             * which will ideally result in an actually smaller image file, as
> +             * is probably desired. */
> +            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
> +                                         MIN(sector_step,
> +                                             bs->total_sectors - start_sector),
> +                                         QCOW2_DISCARD_SNAPSHOT, true);
> +            if (ret < 0) {
> +                break;
> +            }
> +        }

My first though was to add a return here, so the indentation level for
the rest is one less. Probably a matter of taste, though.

> +    } else {
> +        int l1_clusters;
> +        int64_t offset;
> +        uint64_t *new_reftable;
> +        uint64_t rt_entry;
> +        struct {
> +            uint64_t l1_offset;
> +            uint64_t reftable_offset;
> +            uint32_t reftable_clusters;
> +        } QEMU_PACKED l1_ofs_rt_ofs_cls;
> +
> +        ret = qcow2_cache_empty(bs, s->l2_table_cache);
>          if (ret < 0) {
> -            break;
> +            return ret;
> +        }
> +
> +        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        /* Refcounts will be broken utterly */
> +        ret = qcow2_mark_dirty(bs);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        l1_clusters = DIV_ROUND_UP(s->l1_size,
> +                                   s->cluster_size / sizeof(uint64_t));
> +        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
> +        if (!new_reftable) {
> +            return -ENOMEM;
> +        }
> +
> +        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);

Until here, the failure cases are trivially okay. The worst thing that
could happen is that the image is needlessly marked as dirty.

> +        /* Overwrite enough clusters at the beginning of the sectors to place
> +         * the refcount table, a refcount block and the L1 table in; this may
> +         * overwrite parts of the existing refcount and L1 table, which is not
> +         * an issue because the dirty flag is set, complete data loss is in fact
> +         * desired and partial data loss is consequently fine as well */
> +        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
> +                                (2 + l1_clusters) * s->cluster_size /
> +                                BDRV_SECTOR_SIZE, 0);

If we crash at this point, we're _not_ okay any more. --verbose follows:

On disk, we may have overwritten a refcount table or a refcount block
with zeros. This is fine, we have the dirty flag set, so destroying any
refcounting structure does no harm.

We may also have overwritten an L1 or L2 table. As the commit correctly
explains, this is doing partially what the function is supposed to do
for the whole image. Affected data clusters are now read from the
backing file. Good.

However, we may also have overwritten data clusters that are still
accessible using an L1/L2 table that hasn't been hit by this write
operation. We're reading corrupted (zeroed out) data now instead of
going to the backing file. Bug!

In my original suggestion I had an item where the L1 table was zeroed
out first before the start of the image is zeroed. This would have
avoided the bug.

> +        if (ret < 0) {
> +            g_free(new_reftable);
> +            return ret;
> +        }

If we fail here (without crashing), the first clusters could be in their
original state or partially zeroed. Assuming that you fixed the above
bug, the on-disk state would be okay if we opened the image now because
the dirty flag would trigger an image repair; but we don't get the
repair when taking this failure path and we may have zeroed a refcount
table/block. This is probably a problem and we may have to make the BDS
unusable.

The in-memory state of the L1 table is hopefully zeroed out, so it's
consistent with what is on disk.

The in-memory state of the refcount table looks like it's not in sync
with the on-disk state. Note that while the dirty flag allows that the
on-disk state can be anything, the in-memory state is what we keep using
after a failure. The in-memory state isn't accurate at this point, but
we only create leaks. Lots of them, because we zeroed the L1 table, but
that's good enough. If refcounts are updated later, the old offsets
should still be valid.

All of the following code shouldn't change what a guest is seeing, but
only fix up the refcounts.

> +        BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
> +        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
> +
> +        /* "Create" an empty reftable (one cluster) directly after the image
> +         * header and an empty L1 table three clusters after the image header;
> +         * the cluster between those two will be used as the first refblock */
> +        cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
> +        cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
> +        cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
> +        ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
> +                               &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));

If we crash here, we have a new, but still an all-zero L1 table, which
ensures that the guest sees the right thing. We also get a new refcount
table, which is invalid anyway. It is completely zeroed out, image
repair should be able to cope with it.

> +        if (ret < 0) {
> +            g_free(new_reftable);
> +            return ret;
> +        }

Failure without a crash. It should be safe to assume that either the
whole header has been updated (and the flush failed) or it's in its old
state.

Probably the same problem as above with potentially overwritten refcount
blocks while we don't get a repair on this failure path. Without
accurate in-memory refcounts, we can't keep using the BDS.

> +        s->l1_table_offset = 3 * s->cluster_size;
> +        memset(s->l1_table, 0, s->l1_size * sizeof(uint64_t));

This line has been moved to above by the assumed L1 fix.

> +        s->refcount_table_offset = s->cluster_size;
> +        s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
> +
> +        g_free(s->refcount_table);
> +        s->refcount_table = new_reftable;
> +
> +        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
> +
> +        /* Enter the first refblock into the reftable */
> +        rt_entry = cpu_to_be64(2 * s->cluster_size);
> +        ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
> +                               &rt_entry, sizeof(rt_entry));

On-disk status: L1 table empty, refcount table has a refblock, but still
no references. Dirty flag ensures we're safe.

> +        if (ret < 0) {
> +            return ret;
> +        }

Failure path without crash: s->refcount_table_offset/size have been
updated, but we still don't have accurate in-memory information. Still
can't keep using the BDS after failing here.

> +        s->refcount_table[0] = 2 * s->cluster_size;
> +
> +        ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
> +        if (ret < 0) {
> +            return ret;
> +        }

Nothing interesting happens here (except that this is the heart of the
functionality :-)). The on-disk state didn't reference any of the
truncated clusters any more. The in-memory state isn't fixed yet.

> +        s->free_cluster_index = 0;
> +        offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size +
> +                                      s->l1_size * sizeof(uint64_t));
> +        if (offset < 0) {
> +            return offset;

BDS still unusable here.

> +        } else if (offset > 0) {
> +            error_report("First cluster in emptied image is in use");
> +            abort();
> +        }

Finally, at this point we have correct in-memory information again.

Not sure if it's worth the effort of trying to move this further up. We
could at least easily move the bdrv_truncate() to below this, which
would make it's error path less bad.

> +        ret = qcow2_mark_clean(bs);
> +        if (ret < 0) {
> +            return ret;
>          }

And this writes them to disk. If we fail, we can keep going on, as the
in-memory information is correct and after a qemu restart, the dirty
flag will trigger a repair.

>      }
>  
> diff --git a/include/block/block.h b/include/block/block.h
> index 341054d..b1f4385 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -498,6 +498,8 @@ typedef enum {
>      BLKDBG_PWRITEV_ZERO,
>      BLKDBG_PWRITEV_DONE,
>  
> +    BLKDBG_EMPTY_IMAGE_PREPARE,
> +
>      BLKDBG_EVENT_MAX,
>  } BlkDebugEvent;

Kevin
Max Reitz Oct. 23, 2014, 7:46 a.m. UTC | #3
On 2014-10-22 at 20:35, Kevin Wolf wrote:
> Am 22.10.2014 um 14:51 hat Max Reitz geschrieben:
>> bdrv_make_empty() is currently only called if the current image
>> represents an external snapshot that has been committed to its base
>> image; it is therefore unlikely to have internal snapshots. In this
>> case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
>> refcount table (while having the dirty flag set, which only works for
>> compat=1.1) and creating a trivial refcount structure.
>>
>> If there are snapshots or for compat=0.10, fall back to the simple
>> implementation (discard all clusters).
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
> Hey, this feels actually reviewable this time. :-)

I'm still unsure which version I like more. If it wasn't for the math, 
I'd prefer the other version.

>> diff --git a/block/blkdebug.c b/block/blkdebug.c
>> index e046b92..862d93b 100644
>> --- a/block/blkdebug.c
>> +++ b/block/blkdebug.c
>> @@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
>>       [BLKDBG_PWRITEV]                        = "pwritev",
>>       [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
>>       [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
>> +
>> +    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
>>   };
>>   
>>   static int get_event_by_name(const char *name, BlkDebugEvent *event)
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index 1ef3a5f..16dece2 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -2232,24 +2232,137 @@ fail:
>>   
>>   static int qcow2_make_empty(BlockDriverState *bs)
>>   {
>> +    BDRVQcowState *s = bs->opaque;
>>       int ret = 0;
>> -    uint64_t start_sector;
>> -    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>>   
>> -    for (start_sector = 0; start_sector < bs->total_sectors;
>> -         start_sector += sector_step)
>> -    {
>> -        /* As this function is generally used after committing an external
>> -         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
>> -         * default action for this kind of discard is to pass the discard,
>> -         * which will ideally result in an actually smaller image file, as
>> -         * is probably desired. */
>> -        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
>> -                                     MIN(sector_step,
>> -                                         bs->total_sectors - start_sector),
>> -                                     QCOW2_DISCARD_SNAPSHOT, true);
>> +    if (s->snapshots || s->qcow_version < 3) {
>> +        uint64_t start_sector;
>> +        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>> +
>> +        /* If there are snapshots, every active cluster has to be discarded; and
>> +         * because compat=0.10 does not support setting the dirty flag, we have
>> +         * to use this fallback there as well */
>> +
>> +        for (start_sector = 0; start_sector < bs->total_sectors;
>> +             start_sector += sector_step)
>> +        {
>> +            /* As this function is generally used after committing an external
>> +             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
>> +             * default action for this kind of discard is to pass the discard,
>> +             * which will ideally result in an actually smaller image file, as
>> +             * is probably desired. */
>> +            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
>> +                                         MIN(sector_step,
>> +                                             bs->total_sectors - start_sector),
>> +                                         QCOW2_DISCARD_SNAPSHOT, true);
>> +            if (ret < 0) {
>> +                break;
>> +            }
>> +        }
> My first though was to add a return here, so the indentation level for
> the rest is one less. Probably a matter of taste, though.

I'd rather put the second branch into an own function.

>> +    } else {
>> +        int l1_clusters;
>> +        int64_t offset;
>> +        uint64_t *new_reftable;
>> +        uint64_t rt_entry;
>> +        struct {
>> +            uint64_t l1_offset;
>> +            uint64_t reftable_offset;
>> +            uint32_t reftable_clusters;
>> +        } QEMU_PACKED l1_ofs_rt_ofs_cls;
>> +
>> +        ret = qcow2_cache_empty(bs, s->l2_table_cache);
>>           if (ret < 0) {
>> -            break;
>> +            return ret;
>> +        }
>> +
>> +        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        /* Refcounts will be broken utterly */
>> +        ret = qcow2_mark_dirty(bs);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +
>> +        l1_clusters = DIV_ROUND_UP(s->l1_size,
>> +                                   s->cluster_size / sizeof(uint64_t));
>> +        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
>> +        if (!new_reftable) {
>> +            return -ENOMEM;
>> +        }
>> +
>> +        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
> Until here, the failure cases are trivially okay. The worst thing that
> could happen is that the image is needlessly marked as dirty.
>
>> +        /* Overwrite enough clusters at the beginning of the sectors to place
>> +         * the refcount table, a refcount block and the L1 table in; this may
>> +         * overwrite parts of the existing refcount and L1 table, which is not
>> +         * an issue because the dirty flag is set, complete data loss is in fact
>> +         * desired and partial data loss is consequently fine as well */
>> +        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
>> +                                (2 + l1_clusters) * s->cluster_size /
>> +                                BDRV_SECTOR_SIZE, 0);
> If we crash at this point, we're _not_ okay any more. --verbose follows:
>
> On disk, we may have overwritten a refcount table or a refcount block
> with zeros. This is fine, we have the dirty flag set, so destroying any
> refcounting structure does no harm.
>
> We may also have overwritten an L1 or L2 table. As the commit correctly
> explains, this is doing partially what the function is supposed to do
> for the whole image. Affected data clusters are now read from the
> backing file. Good.
>
> However, we may also have overwritten data clusters that are still
> accessible using an L1/L2 table that hasn't been hit by this write
> operation. We're reading corrupted (zeroed out) data now instead of
> going to the backing file. Bug!

Oh, right, I forgot about the L1 table not always being at the start of 
the file.

> In my original suggestion I had an item where the L1 table was zeroed
> out first before the start of the image is zeroed. This would have
> avoided the bug.
>
>> +        if (ret < 0) {
>> +            g_free(new_reftable);
>> +            return ret;
>> +        }
> If we fail here (without crashing), the first clusters could be in their
> original state or partially zeroed. Assuming that you fixed the above
> bug, the on-disk state would be okay if we opened the image now because
> the dirty flag would trigger an image repair; but we don't get the
> repair when taking this failure path and we may have zeroed a refcount
> table/block. This is probably a problem and we may have to make the BDS
> unusable.
>
> The in-memory state of the L1 table is hopefully zeroed out, so it's
> consistent with what is on disk.
>
> The in-memory state of the refcount table looks like it's not in sync
> with the on-disk state. Note that while the dirty flag allows that the
> on-disk state can be anything, the in-memory state is what we keep using
> after a failure. The in-memory state isn't accurate at this point, but
> we only create leaks. Lots of them, because we zeroed the L1 table, but
> that's good enough. If refcounts are updated later, the old offsets
> should still be valid.

If we set at least parts of the in-memory reftable to zero, everything 
probably breaks. Try to allocate a new cluster while the beginning of 
the reftable is zero. So we cannot take the on-disk reftable into memory.

Doing it the other way around, writing the in-memory reftable to disk on 
error won't work either. The refblocks may have been zeroed out, so we 
have exactly the same problem.

Therefore, to make the BDS usable after error, we have to (in the error 
path) read the on-disk reftable into memory, call the qcow2 repair 
function and hope for the best.

Or, you know, we could go back to v11 which had my other version of this 
patch which always kept everything consistent. :-P

> All of the following code shouldn't change what a guest is seeing, but
> only fix up the refcounts.
>
>> +        BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
>> +        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
>> +
>> +        /* "Create" an empty reftable (one cluster) directly after the image
>> +         * header and an empty L1 table three clusters after the image header;
>> +         * the cluster between those two will be used as the first refblock */
>> +        cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
>> +        cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
>> +        cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
>> +        ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
>> +                               &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
> If we crash here, we have a new, but still an all-zero L1 table, which
> ensures that the guest sees the right thing. We also get a new refcount
> table, which is invalid anyway. It is completely zeroed out, image
> repair should be able to cope with it.

It now is, yes.

>> +        if (ret < 0) {
>> +            g_free(new_reftable);
>> +            return ret;
>> +        }
> Failure without a crash. It should be safe to assume that either the
> whole header has been updated (and the flush failed) or it's in its old
> state.
>
> Probably the same problem as above with potentially overwritten refcount
> blocks while we don't get a repair on this failure path. Without
> accurate in-memory refcounts, we can't keep using the BDS.
>
>> +        s->l1_table_offset = 3 * s->cluster_size;
>> +        memset(s->l1_table, 0, s->l1_size * sizeof(uint64_t));
> This line has been moved to above by the assumed L1 fix.
>
>> +        s->refcount_table_offset = s->cluster_size;
>> +        s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
>> +
>> +        g_free(s->refcount_table);
>> +        s->refcount_table = new_reftable;
>> +
>> +        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
>> +
>> +        /* Enter the first refblock into the reftable */
>> +        rt_entry = cpu_to_be64(2 * s->cluster_size);
>> +        ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
>> +                               &rt_entry, sizeof(rt_entry));
> On-disk status: L1 table empty, refcount table has a refblock, but still
> no references. Dirty flag ensures we're safe.
>
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
> Failure path without crash: s->refcount_table_offset/size have been
> updated, but we still don't have accurate in-memory information. Still
> can't keep using the BDS after failing here.
>
>> +        s->refcount_table[0] = 2 * s->cluster_size;
>> +
>> +        ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
> Nothing interesting happens here (except that this is the heart of the
> functionality :-)). The on-disk state didn't reference any of the
> truncated clusters any more. The in-memory state isn't fixed yet.
>
>> +        s->free_cluster_index = 0;
>> +        offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size +
>> +                                      s->l1_size * sizeof(uint64_t));
>> +        if (offset < 0) {
>> +            return offset;
> BDS still unusable here.
>
>> +        } else if (offset > 0) {
>> +            error_report("First cluster in emptied image is in use");
>> +            abort();
>> +        }
> Finally, at this point we have correct in-memory information again.
>
> Not sure if it's worth the effort of trying to move this further up. We
> could at least easily move the bdrv_truncate() to below this, which
> would make it's error path less bad.
>
>> +        ret = qcow2_mark_clean(bs);
>> +        if (ret < 0) {
>> +            return ret;
>>           }
> And this writes them to disk. If we fail, we can keep going on, as the
> in-memory information is correct and after a qemu restart, the dirty
> flag will trigger a repair.
>
>>       }
>>   
>> diff --git a/include/block/block.h b/include/block/block.h
>> index 341054d..b1f4385 100644
>> --- a/include/block/block.h
>> +++ b/include/block/block.h
>> @@ -498,6 +498,8 @@ typedef enum {
>>       BLKDBG_PWRITEV_ZERO,
>>       BLKDBG_PWRITEV_DONE,
>>   
>> +    BLKDBG_EMPTY_IMAGE_PREPARE,
>> +
>>       BLKDBG_EVENT_MAX,
>>   } BlkDebugEvent;
> Kevin

Thank you for your review,

Max
Kevin Wolf Oct. 23, 2014, 8:29 a.m. UTC | #4
Am 23.10.2014 um 09:46 hat Max Reitz geschrieben:
> On 2014-10-22 at 20:35, Kevin Wolf wrote:
> >Am 22.10.2014 um 14:51 hat Max Reitz geschrieben:
> >>bdrv_make_empty() is currently only called if the current image
> >>represents an external snapshot that has been committed to its base
> >>image; it is therefore unlikely to have internal snapshots. In this
> >>case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
> >>refcount table (while having the dirty flag set, which only works for
> >>compat=1.1) and creating a trivial refcount structure.
> >>
> >>If there are snapshots or for compat=0.10, fall back to the simple
> >>implementation (discard all clusters).
> >>
> >>Signed-off-by: Max Reitz <mreitz@redhat.com>
> >Hey, this feels actually reviewable this time. :-)
> 
> I'm still unsure which version I like more. If it wasn't for the
> math, I'd prefer the other version.
> 
> >>diff --git a/block/blkdebug.c b/block/blkdebug.c
> >>index e046b92..862d93b 100644
> >>--- a/block/blkdebug.c
> >>+++ b/block/blkdebug.c
> >>@@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
> >>      [BLKDBG_PWRITEV]                        = "pwritev",
> >>      [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
> >>      [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
> >>+
> >>+    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
> >>  };
> >>  static int get_event_by_name(const char *name, BlkDebugEvent *event)
> >>diff --git a/block/qcow2.c b/block/qcow2.c
> >>index 1ef3a5f..16dece2 100644
> >>--- a/block/qcow2.c
> >>+++ b/block/qcow2.c
> >>@@ -2232,24 +2232,137 @@ fail:
> >>  static int qcow2_make_empty(BlockDriverState *bs)
> >>  {
> >>+    BDRVQcowState *s = bs->opaque;
> >>      int ret = 0;
> >>-    uint64_t start_sector;
> >>-    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
> >>-    for (start_sector = 0; start_sector < bs->total_sectors;
> >>-         start_sector += sector_step)
> >>-    {
> >>-        /* As this function is generally used after committing an external
> >>-         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
> >>-         * default action for this kind of discard is to pass the discard,
> >>-         * which will ideally result in an actually smaller image file, as
> >>-         * is probably desired. */
> >>-        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
> >>-                                     MIN(sector_step,
> >>-                                         bs->total_sectors - start_sector),
> >>-                                     QCOW2_DISCARD_SNAPSHOT, true);
> >>+    if (s->snapshots || s->qcow_version < 3) {
> >>+        uint64_t start_sector;
> >>+        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
> >>+
> >>+        /* If there are snapshots, every active cluster has to be discarded; and
> >>+         * because compat=0.10 does not support setting the dirty flag, we have
> >>+         * to use this fallback there as well */
> >>+
> >>+        for (start_sector = 0; start_sector < bs->total_sectors;
> >>+             start_sector += sector_step)
> >>+        {
> >>+            /* As this function is generally used after committing an external
> >>+             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
> >>+             * default action for this kind of discard is to pass the discard,
> >>+             * which will ideally result in an actually smaller image file, as
> >>+             * is probably desired. */
> >>+            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
> >>+                                         MIN(sector_step,
> >>+                                             bs->total_sectors - start_sector),
> >>+                                         QCOW2_DISCARD_SNAPSHOT, true);
> >>+            if (ret < 0) {
> >>+                break;
> >>+            }
> >>+        }
> >My first though was to add a return here, so the indentation level for
> >the rest is one less. Probably a matter of taste, though.
> 
> I'd rather put the second branch into an own function.

Works for me.

> >>+    } else {
> >>+        int l1_clusters;
> >>+        int64_t offset;
> >>+        uint64_t *new_reftable;
> >>+        uint64_t rt_entry;
> >>+        struct {
> >>+            uint64_t l1_offset;
> >>+            uint64_t reftable_offset;
> >>+            uint32_t reftable_clusters;
> >>+        } QEMU_PACKED l1_ofs_rt_ofs_cls;
> >>+
> >>+        ret = qcow2_cache_empty(bs, s->l2_table_cache);
> >>          if (ret < 0) {
> >>-            break;
> >>+            return ret;
> >>+        }
> >>+
> >>+        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
> >>+        if (ret < 0) {
> >>+            return ret;
> >>+        }
> >>+
> >>+        /* Refcounts will be broken utterly */
> >>+        ret = qcow2_mark_dirty(bs);
> >>+        if (ret < 0) {
> >>+            return ret;
> >>+        }
> >>+
> >>+        l1_clusters = DIV_ROUND_UP(s->l1_size,
> >>+                                   s->cluster_size / sizeof(uint64_t));
> >>+        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
> >>+        if (!new_reftable) {
> >>+            return -ENOMEM;
> >>+        }
> >>+
> >>+        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
> >Until here, the failure cases are trivially okay. The worst thing that
> >could happen is that the image is needlessly marked as dirty.
> >
> >>+        /* Overwrite enough clusters at the beginning of the sectors to place
> >>+         * the refcount table, a refcount block and the L1 table in; this may
> >>+         * overwrite parts of the existing refcount and L1 table, which is not
> >>+         * an issue because the dirty flag is set, complete data loss is in fact
> >>+         * desired and partial data loss is consequently fine as well */
> >>+        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
> >>+                                (2 + l1_clusters) * s->cluster_size /
> >>+                                BDRV_SECTOR_SIZE, 0);
> >If we crash at this point, we're _not_ okay any more. --verbose follows:
> >
> >On disk, we may have overwritten a refcount table or a refcount block
> >with zeros. This is fine, we have the dirty flag set, so destroying any
> >refcounting structure does no harm.
> >
> >We may also have overwritten an L1 or L2 table. As the commit correctly
> >explains, this is doing partially what the function is supposed to do
> >for the whole image. Affected data clusters are now read from the
> >backing file. Good.
> >
> >However, we may also have overwritten data clusters that are still
> >accessible using an L1/L2 table that hasn't been hit by this write
> >operation. We're reading corrupted (zeroed out) data now instead of
> >going to the backing file. Bug!
> 
> Oh, right, I forgot about the L1 table not always being at the start
> of the file.
> 
> >In my original suggestion I had an item where the L1 table was zeroed
> >out first before the start of the image is zeroed. This would have
> >avoided the bug.
> >
> >>+        if (ret < 0) {
> >>+            g_free(new_reftable);
> >>+            return ret;
> >>+        }
> >If we fail here (without crashing), the first clusters could be in their
> >original state or partially zeroed. Assuming that you fixed the above
> >bug, the on-disk state would be okay if we opened the image now because
> >the dirty flag would trigger an image repair; but we don't get the
> >repair when taking this failure path and we may have zeroed a refcount
> >table/block. This is probably a problem and we may have to make the BDS
> >unusable.
> >
> >The in-memory state of the L1 table is hopefully zeroed out, so it's
> >consistent with what is on disk.
> >
> >The in-memory state of the refcount table looks like it's not in sync
> >with the on-disk state. Note that while the dirty flag allows that the
> >on-disk state can be anything, the in-memory state is what we keep using
> >after a failure. The in-memory state isn't accurate at this point, but
> >we only create leaks. Lots of them, because we zeroed the L1 table, but
> >that's good enough. If refcounts are updated later, the old offsets
> >should still be valid.
> 
> If we set at least parts of the in-memory reftable to zero,
> everything probably breaks. Try to allocate a new cluster while the
> beginning of the reftable is zero. So we cannot take the on-disk
> reftable into memory.
> 
> Doing it the other way around, writing the in-memory reftable to
> disk on error won't work either. The refblocks may have been zeroed
> out, so we have exactly the same problem.
> 
> Therefore, to make the BDS usable after error, we have to (in the
> error path) read the on-disk reftable into memory, call the qcow2
> repair function and hope for the best.
> 
> Or, you know, we could go back to v11 which had my other version of
> this patch which always kept everything consistent. :-P

I'm not sure how important it is to keep the BDS usable after such an
unlikely error.

I started to write up a suggestion that we could do without
qcow2_alloc_clusters(), but just build up the first refcount block
ourselves. Doesn't really help us, because the new state is not what we
need here, but it made me aware that it assumes that the L1 table is
small enough to fit in the first refcount block and we would have to
fall back to discard if it isn't. Come to think of it, I suspect you
already make the same assumption without checking it.


Anyway, if we want to keep the BDS usable what is the right state?
We need references for the header, for the L1 table, for the refcount
table and all refcount blocks. If we decide to build a new in-memory
refcount table, the refcount blocks are only the refcount blocks that
are still referenced there, i.e. we don't have to worry about the
location of refcount blocks on the disk.

In fact, we can also safely change the refcount table offset to
cluster 1 and handle it together with the header. We'll then need one
refcount block for header/reftable and potentially another one for the
L1 table, which still must be in its original place (adding the
assumption that the L1 table doesn't cross a refblock boundary :-/).

Our refblock cache can hold 4 tables at least, so creating two refblocks
purely in memory is doable.

Leaves the question, is it worth the hassle?

Kevin
Max Reitz Oct. 23, 2014, 8:36 a.m. UTC | #5
On 2014-10-23 at 10:29, Kevin Wolf wrote:
> Am 23.10.2014 um 09:46 hat Max Reitz geschrieben:
>> On 2014-10-22 at 20:35, Kevin Wolf wrote:
>>> Am 22.10.2014 um 14:51 hat Max Reitz geschrieben:
>>>> bdrv_make_empty() is currently only called if the current image
>>>> represents an external snapshot that has been committed to its base
>>>> image; it is therefore unlikely to have internal snapshots. In this
>>>> case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
>>>> refcount table (while having the dirty flag set, which only works for
>>>> compat=1.1) and creating a trivial refcount structure.
>>>>
>>>> If there are snapshots or for compat=0.10, fall back to the simple
>>>> implementation (discard all clusters).
>>>>
>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>> Hey, this feels actually reviewable this time. :-)
>> I'm still unsure which version I like more. If it wasn't for the
>> math, I'd prefer the other version.
>>
>>>> diff --git a/block/blkdebug.c b/block/blkdebug.c
>>>> index e046b92..862d93b 100644
>>>> --- a/block/blkdebug.c
>>>> +++ b/block/blkdebug.c
>>>> @@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
>>>>       [BLKDBG_PWRITEV]                        = "pwritev",
>>>>       [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
>>>>       [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
>>>> +
>>>> +    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
>>>>   };
>>>>   static int get_event_by_name(const char *name, BlkDebugEvent *event)
>>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>>> index 1ef3a5f..16dece2 100644
>>>> --- a/block/qcow2.c
>>>> +++ b/block/qcow2.c
>>>> @@ -2232,24 +2232,137 @@ fail:
>>>>   static int qcow2_make_empty(BlockDriverState *bs)
>>>>   {
>>>> +    BDRVQcowState *s = bs->opaque;
>>>>       int ret = 0;
>>>> -    uint64_t start_sector;
>>>> -    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>>>> -    for (start_sector = 0; start_sector < bs->total_sectors;
>>>> -         start_sector += sector_step)
>>>> -    {
>>>> -        /* As this function is generally used after committing an external
>>>> -         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
>>>> -         * default action for this kind of discard is to pass the discard,
>>>> -         * which will ideally result in an actually smaller image file, as
>>>> -         * is probably desired. */
>>>> -        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
>>>> -                                     MIN(sector_step,
>>>> -                                         bs->total_sectors - start_sector),
>>>> -                                     QCOW2_DISCARD_SNAPSHOT, true);
>>>> +    if (s->snapshots || s->qcow_version < 3) {
>>>> +        uint64_t start_sector;
>>>> +        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>>>> +
>>>> +        /* If there are snapshots, every active cluster has to be discarded; and
>>>> +         * because compat=0.10 does not support setting the dirty flag, we have
>>>> +         * to use this fallback there as well */
>>>> +
>>>> +        for (start_sector = 0; start_sector < bs->total_sectors;
>>>> +             start_sector += sector_step)
>>>> +        {
>>>> +            /* As this function is generally used after committing an external
>>>> +             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
>>>> +             * default action for this kind of discard is to pass the discard,
>>>> +             * which will ideally result in an actually smaller image file, as
>>>> +             * is probably desired. */
>>>> +            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
>>>> +                                         MIN(sector_step,
>>>> +                                             bs->total_sectors - start_sector),
>>>> +                                         QCOW2_DISCARD_SNAPSHOT, true);
>>>> +            if (ret < 0) {
>>>> +                break;
>>>> +            }
>>>> +        }
>>> My first though was to add a return here, so the indentation level for
>>> the rest is one less. Probably a matter of taste, though.
>> I'd rather put the second branch into an own function.
> Works for me.
>
>>>> +    } else {
>>>> +        int l1_clusters;
>>>> +        int64_t offset;
>>>> +        uint64_t *new_reftable;
>>>> +        uint64_t rt_entry;
>>>> +        struct {
>>>> +            uint64_t l1_offset;
>>>> +            uint64_t reftable_offset;
>>>> +            uint32_t reftable_clusters;
>>>> +        } QEMU_PACKED l1_ofs_rt_ofs_cls;
>>>> +
>>>> +        ret = qcow2_cache_empty(bs, s->l2_table_cache);
>>>>           if (ret < 0) {
>>>> -            break;
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
>>>> +        if (ret < 0) {
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        /* Refcounts will be broken utterly */
>>>> +        ret = qcow2_mark_dirty(bs);
>>>> +        if (ret < 0) {
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        l1_clusters = DIV_ROUND_UP(s->l1_size,
>>>> +                                   s->cluster_size / sizeof(uint64_t));
>>>> +        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
>>>> +        if (!new_reftable) {
>>>> +            return -ENOMEM;
>>>> +        }
>>>> +
>>>> +        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
>>> Until here, the failure cases are trivially okay. The worst thing that
>>> could happen is that the image is needlessly marked as dirty.
>>>
>>>> +        /* Overwrite enough clusters at the beginning of the sectors to place
>>>> +         * the refcount table, a refcount block and the L1 table in; this may
>>>> +         * overwrite parts of the existing refcount and L1 table, which is not
>>>> +         * an issue because the dirty flag is set, complete data loss is in fact
>>>> +         * desired and partial data loss is consequently fine as well */
>>>> +        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
>>>> +                                (2 + l1_clusters) * s->cluster_size /
>>>> +                                BDRV_SECTOR_SIZE, 0);
>>> If we crash at this point, we're _not_ okay any more. --verbose follows:
>>>
>>> On disk, we may have overwritten a refcount table or a refcount block
>>> with zeros. This is fine, we have the dirty flag set, so destroying any
>>> refcounting structure does no harm.
>>>
>>> We may also have overwritten an L1 or L2 table. As the commit correctly
>>> explains, this is doing partially what the function is supposed to do
>>> for the whole image. Affected data clusters are now read from the
>>> backing file. Good.
>>>
>>> However, we may also have overwritten data clusters that are still
>>> accessible using an L1/L2 table that hasn't been hit by this write
>>> operation. We're reading corrupted (zeroed out) data now instead of
>>> going to the backing file. Bug!
>> Oh, right, I forgot about the L1 table not always being at the start
>> of the file.
>>
>>> In my original suggestion I had an item where the L1 table was zeroed
>>> out first before the start of the image is zeroed. This would have
>>> avoided the bug.
>>>
>>>> +        if (ret < 0) {
>>>> +            g_free(new_reftable);
>>>> +            return ret;
>>>> +        }
>>> If we fail here (without crashing), the first clusters could be in their
>>> original state or partially zeroed. Assuming that you fixed the above
>>> bug, the on-disk state would be okay if we opened the image now because
>>> the dirty flag would trigger an image repair; but we don't get the
>>> repair when taking this failure path and we may have zeroed a refcount
>>> table/block. This is probably a problem and we may have to make the BDS
>>> unusable.
>>>
>>> The in-memory state of the L1 table is hopefully zeroed out, so it's
>>> consistent with what is on disk.
>>>
>>> The in-memory state of the refcount table looks like it's not in sync
>>> with the on-disk state. Note that while the dirty flag allows that the
>>> on-disk state can be anything, the in-memory state is what we keep using
>>> after a failure. The in-memory state isn't accurate at this point, but
>>> we only create leaks. Lots of them, because we zeroed the L1 table, but
>>> that's good enough. If refcounts are updated later, the old offsets
>>> should still be valid.
>> If we set at least parts of the in-memory reftable to zero,
>> everything probably breaks. Try to allocate a new cluster while the
>> beginning of the reftable is zero. So we cannot take the on-disk
>> reftable into memory.
>>
>> Doing it the other way around, writing the in-memory reftable to
>> disk on error won't work either. The refblocks may have been zeroed
>> out, so we have exactly the same problem.
>>
>> Therefore, to make the BDS usable after error, we have to (in the
>> error path) read the on-disk reftable into memory, call the qcow2
>> repair function and hope for the best.
>>
>> Or, you know, we could go back to v11 which had my other version of
>> this patch which always kept everything consistent. :-P
> I'm not sure how important it is to keep the BDS usable after such an
> unlikely error.
>
> I started to write up a suggestion that we could do without
> qcow2_alloc_clusters(), but just build up the first refcount block
> ourselves. Doesn't really help us, because the new state is not what we
> need here, but it made me aware that it assumes that the L1 table is
> small enough to fit in the first refcount block and we would have to
> fall back to discard if it isn't. Come to think of it, I suspect you
> already make the same assumption without checking it.
>
>
> Anyway, if we want to keep the BDS usable what is the right state?
> We need references for the header, for the L1 table, for the refcount
> table and all refcount blocks. If we decide to build a new in-memory
> refcount table, the refcount blocks are only the refcount blocks that
> are still referenced there, i.e. we don't have to worry about the
> location of refcount blocks on the disk.
>
> In fact, we can also safely change the refcount table offset to
> cluster 1 and handle it together with the header. We'll then need one
> refcount block for header/reftable and potentially another one for the
> L1 table, which still must be in its original place (adding the
> assumption that the L1 table doesn't cross a refblock boundary :-/).
>
> Our refblock cache can hold 4 tables at least, so creating two refblocks
> purely in memory is doable.
>
> Leaves the question, is it worth the hassle?

Probably not. Would you be fine with setting bs->drv to NULL in the 
problematic error paths?

Max
Kevin Wolf Oct. 23, 2014, 8:41 a.m. UTC | #6
Am 23.10.2014 um 10:36 hat Max Reitz geschrieben:
> On 2014-10-23 at 10:29, Kevin Wolf wrote:
> >Am 23.10.2014 um 09:46 hat Max Reitz geschrieben:
> >>On 2014-10-22 at 20:35, Kevin Wolf wrote:
> >>>Am 22.10.2014 um 14:51 hat Max Reitz geschrieben:
> >>>>bdrv_make_empty() is currently only called if the current image
> >>>>represents an external snapshot that has been committed to its base
> >>>>image; it is therefore unlikely to have internal snapshots. In this
> >>>>case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
> >>>>refcount table (while having the dirty flag set, which only works for
> >>>>compat=1.1) and creating a trivial refcount structure.
> >>>>
> >>>>If there are snapshots or for compat=0.10, fall back to the simple
> >>>>implementation (discard all clusters).
> >>>>
> >>>>Signed-off-by: Max Reitz <mreitz@redhat.com>
> >>>Hey, this feels actually reviewable this time. :-)
> >>I'm still unsure which version I like more. If it wasn't for the
> >>math, I'd prefer the other version.
> >>
> >>>>diff --git a/block/blkdebug.c b/block/blkdebug.c
> >>>>index e046b92..862d93b 100644
> >>>>--- a/block/blkdebug.c
> >>>>+++ b/block/blkdebug.c
> >>>>@@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
> >>>>      [BLKDBG_PWRITEV]                        = "pwritev",
> >>>>      [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
> >>>>      [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
> >>>>+
> >>>>+    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
> >>>>  };
> >>>>  static int get_event_by_name(const char *name, BlkDebugEvent *event)
> >>>>diff --git a/block/qcow2.c b/block/qcow2.c
> >>>>index 1ef3a5f..16dece2 100644
> >>>>--- a/block/qcow2.c
> >>>>+++ b/block/qcow2.c
> >>>>@@ -2232,24 +2232,137 @@ fail:
> >>>>  static int qcow2_make_empty(BlockDriverState *bs)
> >>>>  {
> >>>>+    BDRVQcowState *s = bs->opaque;
> >>>>      int ret = 0;
> >>>>-    uint64_t start_sector;
> >>>>-    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
> >>>>-    for (start_sector = 0; start_sector < bs->total_sectors;
> >>>>-         start_sector += sector_step)
> >>>>-    {
> >>>>-        /* As this function is generally used after committing an external
> >>>>-         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
> >>>>-         * default action for this kind of discard is to pass the discard,
> >>>>-         * which will ideally result in an actually smaller image file, as
> >>>>-         * is probably desired. */
> >>>>-        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
> >>>>-                                     MIN(sector_step,
> >>>>-                                         bs->total_sectors - start_sector),
> >>>>-                                     QCOW2_DISCARD_SNAPSHOT, true);
> >>>>+    if (s->snapshots || s->qcow_version < 3) {
> >>>>+        uint64_t start_sector;
> >>>>+        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
> >>>>+
> >>>>+        /* If there are snapshots, every active cluster has to be discarded; and
> >>>>+         * because compat=0.10 does not support setting the dirty flag, we have
> >>>>+         * to use this fallback there as well */
> >>>>+
> >>>>+        for (start_sector = 0; start_sector < bs->total_sectors;
> >>>>+             start_sector += sector_step)
> >>>>+        {
> >>>>+            /* As this function is generally used after committing an external
> >>>>+             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
> >>>>+             * default action for this kind of discard is to pass the discard,
> >>>>+             * which will ideally result in an actually smaller image file, as
> >>>>+             * is probably desired. */
> >>>>+            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
> >>>>+                                         MIN(sector_step,
> >>>>+                                             bs->total_sectors - start_sector),
> >>>>+                                         QCOW2_DISCARD_SNAPSHOT, true);
> >>>>+            if (ret < 0) {
> >>>>+                break;
> >>>>+            }
> >>>>+        }
> >>>My first though was to add a return here, so the indentation level for
> >>>the rest is one less. Probably a matter of taste, though.
> >>I'd rather put the second branch into an own function.
> >Works for me.
> >
> >>>>+    } else {
> >>>>+        int l1_clusters;
> >>>>+        int64_t offset;
> >>>>+        uint64_t *new_reftable;
> >>>>+        uint64_t rt_entry;
> >>>>+        struct {
> >>>>+            uint64_t l1_offset;
> >>>>+            uint64_t reftable_offset;
> >>>>+            uint32_t reftable_clusters;
> >>>>+        } QEMU_PACKED l1_ofs_rt_ofs_cls;
> >>>>+
> >>>>+        ret = qcow2_cache_empty(bs, s->l2_table_cache);
> >>>>          if (ret < 0) {
> >>>>-            break;
> >>>>+            return ret;
> >>>>+        }
> >>>>+
> >>>>+        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
> >>>>+        if (ret < 0) {
> >>>>+            return ret;
> >>>>+        }
> >>>>+
> >>>>+        /* Refcounts will be broken utterly */
> >>>>+        ret = qcow2_mark_dirty(bs);
> >>>>+        if (ret < 0) {
> >>>>+            return ret;
> >>>>+        }
> >>>>+
> >>>>+        l1_clusters = DIV_ROUND_UP(s->l1_size,
> >>>>+                                   s->cluster_size / sizeof(uint64_t));
> >>>>+        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
> >>>>+        if (!new_reftable) {
> >>>>+            return -ENOMEM;
> >>>>+        }
> >>>>+
> >>>>+        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
> >>>Until here, the failure cases are trivially okay. The worst thing that
> >>>could happen is that the image is needlessly marked as dirty.
> >>>
> >>>>+        /* Overwrite enough clusters at the beginning of the sectors to place
> >>>>+         * the refcount table, a refcount block and the L1 table in; this may
> >>>>+         * overwrite parts of the existing refcount and L1 table, which is not
> >>>>+         * an issue because the dirty flag is set, complete data loss is in fact
> >>>>+         * desired and partial data loss is consequently fine as well */
> >>>>+        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
> >>>>+                                (2 + l1_clusters) * s->cluster_size /
> >>>>+                                BDRV_SECTOR_SIZE, 0);
> >>>If we crash at this point, we're _not_ okay any more. --verbose follows:
> >>>
> >>>On disk, we may have overwritten a refcount table or a refcount block
> >>>with zeros. This is fine, we have the dirty flag set, so destroying any
> >>>refcounting structure does no harm.
> >>>
> >>>We may also have overwritten an L1 or L2 table. As the commit correctly
> >>>explains, this is doing partially what the function is supposed to do
> >>>for the whole image. Affected data clusters are now read from the
> >>>backing file. Good.
> >>>
> >>>However, we may also have overwritten data clusters that are still
> >>>accessible using an L1/L2 table that hasn't been hit by this write
> >>>operation. We're reading corrupted (zeroed out) data now instead of
> >>>going to the backing file. Bug!
> >>Oh, right, I forgot about the L1 table not always being at the start
> >>of the file.
> >>
> >>>In my original suggestion I had an item where the L1 table was zeroed
> >>>out first before the start of the image is zeroed. This would have
> >>>avoided the bug.
> >>>
> >>>>+        if (ret < 0) {
> >>>>+            g_free(new_reftable);
> >>>>+            return ret;
> >>>>+        }
> >>>If we fail here (without crashing), the first clusters could be in their
> >>>original state or partially zeroed. Assuming that you fixed the above
> >>>bug, the on-disk state would be okay if we opened the image now because
> >>>the dirty flag would trigger an image repair; but we don't get the
> >>>repair when taking this failure path and we may have zeroed a refcount
> >>>table/block. This is probably a problem and we may have to make the BDS
> >>>unusable.
> >>>
> >>>The in-memory state of the L1 table is hopefully zeroed out, so it's
> >>>consistent with what is on disk.
> >>>
> >>>The in-memory state of the refcount table looks like it's not in sync
> >>>with the on-disk state. Note that while the dirty flag allows that the
> >>>on-disk state can be anything, the in-memory state is what we keep using
> >>>after a failure. The in-memory state isn't accurate at this point, but
> >>>we only create leaks. Lots of them, because we zeroed the L1 table, but
> >>>that's good enough. If refcounts are updated later, the old offsets
> >>>should still be valid.
> >>If we set at least parts of the in-memory reftable to zero,
> >>everything probably breaks. Try to allocate a new cluster while the
> >>beginning of the reftable is zero. So we cannot take the on-disk
> >>reftable into memory.
> >>
> >>Doing it the other way around, writing the in-memory reftable to
> >>disk on error won't work either. The refblocks may have been zeroed
> >>out, so we have exactly the same problem.
> >>
> >>Therefore, to make the BDS usable after error, we have to (in the
> >>error path) read the on-disk reftable into memory, call the qcow2
> >>repair function and hope for the best.
> >>
> >>Or, you know, we could go back to v11 which had my other version of
> >>this patch which always kept everything consistent. :-P
> >I'm not sure how important it is to keep the BDS usable after such an
> >unlikely error.
> >
> >I started to write up a suggestion that we could do without
> >qcow2_alloc_clusters(), but just build up the first refcount block
> >ourselves. Doesn't really help us, because the new state is not what we
> >need here, but it made me aware that it assumes that the L1 table is
> >small enough to fit in the first refcount block and we would have to
> >fall back to discard if it isn't. Come to think of it, I suspect you
> >already make the same assumption without checking it.
> >
> >
> >Anyway, if we want to keep the BDS usable what is the right state?
> >We need references for the header, for the L1 table, for the refcount
> >table and all refcount blocks. If we decide to build a new in-memory
> >refcount table, the refcount blocks are only the refcount blocks that
> >are still referenced there, i.e. we don't have to worry about the
> >location of refcount blocks on the disk.
> >
> >In fact, we can also safely change the refcount table offset to
> >cluster 1 and handle it together with the header. We'll then need one
> >refcount block for header/reftable and potentially another one for the
> >L1 table, which still must be in its original place (adding the
> >assumption that the L1 table doesn't cross a refblock boundary :-/).
> >
> >Our refblock cache can hold 4 tables at least, so creating two refblocks
> >purely in memory is doable.
> >
> >Leaves the question, is it worth the hassle?
> 
> Probably not. Would you be fine with setting bs->drv to NULL in the
> problematic error paths?

By calling qcow2_signal_corruption(), right? I think that's fine.

What about the assumption that 3 + l1_size fits in one refcount block?
Do we need to check it even now?

Kevin
Max Reitz Oct. 23, 2014, 9:11 a.m. UTC | #7
On 2014-10-23 at 10:41, Kevin Wolf wrote:
> Am 23.10.2014 um 10:36 hat Max Reitz geschrieben:
>> On 2014-10-23 at 10:29, Kevin Wolf wrote:
>>> Am 23.10.2014 um 09:46 hat Max Reitz geschrieben:
>>>> On 2014-10-22 at 20:35, Kevin Wolf wrote:
>>>>> Am 22.10.2014 um 14:51 hat Max Reitz geschrieben:
>>>>>> bdrv_make_empty() is currently only called if the current image
>>>>>> represents an external snapshot that has been committed to its base
>>>>>> image; it is therefore unlikely to have internal snapshots. In this
>>>>>> case, bdrv_make_empty() can be greatly sped up by emptying the L1 and
>>>>>> refcount table (while having the dirty flag set, which only works for
>>>>>> compat=1.1) and creating a trivial refcount structure.
>>>>>>
>>>>>> If there are snapshots or for compat=0.10, fall back to the simple
>>>>>> implementation (discard all clusters).
>>>>>>
>>>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>>> Hey, this feels actually reviewable this time. :-)
>>>> I'm still unsure which version I like more. If it wasn't for the
>>>> math, I'd prefer the other version.
>>>>
>>>>>> diff --git a/block/blkdebug.c b/block/blkdebug.c
>>>>>> index e046b92..862d93b 100644
>>>>>> --- a/block/blkdebug.c
>>>>>> +++ b/block/blkdebug.c
>>>>>> @@ -195,6 +195,8 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
>>>>>>       [BLKDBG_PWRITEV]                        = "pwritev",
>>>>>>       [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
>>>>>>       [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
>>>>>> +
>>>>>> +    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
>>>>>>   };
>>>>>>   static int get_event_by_name(const char *name, BlkDebugEvent *event)
>>>>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>>>>> index 1ef3a5f..16dece2 100644
>>>>>> --- a/block/qcow2.c
>>>>>> +++ b/block/qcow2.c
>>>>>> @@ -2232,24 +2232,137 @@ fail:
>>>>>>   static int qcow2_make_empty(BlockDriverState *bs)
>>>>>>   {
>>>>>> +    BDRVQcowState *s = bs->opaque;
>>>>>>       int ret = 0;
>>>>>> -    uint64_t start_sector;
>>>>>> -    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>>>>>> -    for (start_sector = 0; start_sector < bs->total_sectors;
>>>>>> -         start_sector += sector_step)
>>>>>> -    {
>>>>>> -        /* As this function is generally used after committing an external
>>>>>> -         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
>>>>>> -         * default action for this kind of discard is to pass the discard,
>>>>>> -         * which will ideally result in an actually smaller image file, as
>>>>>> -         * is probably desired. */
>>>>>> -        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
>>>>>> -                                     MIN(sector_step,
>>>>>> -                                         bs->total_sectors - start_sector),
>>>>>> -                                     QCOW2_DISCARD_SNAPSHOT, true);
>>>>>> +    if (s->snapshots || s->qcow_version < 3) {
>>>>>> +        uint64_t start_sector;
>>>>>> +        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
>>>>>> +
>>>>>> +        /* If there are snapshots, every active cluster has to be discarded; and
>>>>>> +         * because compat=0.10 does not support setting the dirty flag, we have
>>>>>> +         * to use this fallback there as well */
>>>>>> +
>>>>>> +        for (start_sector = 0; start_sector < bs->total_sectors;
>>>>>> +             start_sector += sector_step)
>>>>>> +        {
>>>>>> +            /* As this function is generally used after committing an external
>>>>>> +             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
>>>>>> +             * default action for this kind of discard is to pass the discard,
>>>>>> +             * which will ideally result in an actually smaller image file, as
>>>>>> +             * is probably desired. */
>>>>>> +            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
>>>>>> +                                         MIN(sector_step,
>>>>>> +                                             bs->total_sectors - start_sector),
>>>>>> +                                         QCOW2_DISCARD_SNAPSHOT, true);
>>>>>> +            if (ret < 0) {
>>>>>> +                break;
>>>>>> +            }
>>>>>> +        }
>>>>> My first though was to add a return here, so the indentation level for
>>>>> the rest is one less. Probably a matter of taste, though.
>>>> I'd rather put the second branch into an own function.
>>> Works for me.
>>>
>>>>>> +    } else {
>>>>>> +        int l1_clusters;
>>>>>> +        int64_t offset;
>>>>>> +        uint64_t *new_reftable;
>>>>>> +        uint64_t rt_entry;
>>>>>> +        struct {
>>>>>> +            uint64_t l1_offset;
>>>>>> +            uint64_t reftable_offset;
>>>>>> +            uint32_t reftable_clusters;
>>>>>> +        } QEMU_PACKED l1_ofs_rt_ofs_cls;
>>>>>> +
>>>>>> +        ret = qcow2_cache_empty(bs, s->l2_table_cache);
>>>>>>           if (ret < 0) {
>>>>>> -            break;
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +
>>>>>> +        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
>>>>>> +        if (ret < 0) {
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +
>>>>>> +        /* Refcounts will be broken utterly */
>>>>>> +        ret = qcow2_mark_dirty(bs);
>>>>>> +        if (ret < 0) {
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +
>>>>>> +        l1_clusters = DIV_ROUND_UP(s->l1_size,
>>>>>> +                                   s->cluster_size / sizeof(uint64_t));
>>>>>> +        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
>>>>>> +        if (!new_reftable) {
>>>>>> +            return -ENOMEM;
>>>>>> +        }
>>>>>> +
>>>>>> +        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
>>>>> Until here, the failure cases are trivially okay. The worst thing that
>>>>> could happen is that the image is needlessly marked as dirty.
>>>>>
>>>>>> +        /* Overwrite enough clusters at the beginning of the sectors to place
>>>>>> +         * the refcount table, a refcount block and the L1 table in; this may
>>>>>> +         * overwrite parts of the existing refcount and L1 table, which is not
>>>>>> +         * an issue because the dirty flag is set, complete data loss is in fact
>>>>>> +         * desired and partial data loss is consequently fine as well */
>>>>>> +        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
>>>>>> +                                (2 + l1_clusters) * s->cluster_size /
>>>>>> +                                BDRV_SECTOR_SIZE, 0);
>>>>> If we crash at this point, we're _not_ okay any more. --verbose follows:
>>>>>
>>>>> On disk, we may have overwritten a refcount table or a refcount block
>>>>> with zeros. This is fine, we have the dirty flag set, so destroying any
>>>>> refcounting structure does no harm.
>>>>>
>>>>> We may also have overwritten an L1 or L2 table. As the commit correctly
>>>>> explains, this is doing partially what the function is supposed to do
>>>>> for the whole image. Affected data clusters are now read from the
>>>>> backing file. Good.
>>>>>
>>>>> However, we may also have overwritten data clusters that are still
>>>>> accessible using an L1/L2 table that hasn't been hit by this write
>>>>> operation. We're reading corrupted (zeroed out) data now instead of
>>>>> going to the backing file. Bug!
>>>> Oh, right, I forgot about the L1 table not always being at the start
>>>> of the file.
>>>>
>>>>> In my original suggestion I had an item where the L1 table was zeroed
>>>>> out first before the start of the image is zeroed. This would have
>>>>> avoided the bug.
>>>>>
>>>>>> +        if (ret < 0) {
>>>>>> +            g_free(new_reftable);
>>>>>> +            return ret;
>>>>>> +        }
>>>>> If we fail here (without crashing), the first clusters could be in their
>>>>> original state or partially zeroed. Assuming that you fixed the above
>>>>> bug, the on-disk state would be okay if we opened the image now because
>>>>> the dirty flag would trigger an image repair; but we don't get the
>>>>> repair when taking this failure path and we may have zeroed a refcount
>>>>> table/block. This is probably a problem and we may have to make the BDS
>>>>> unusable.
>>>>>
>>>>> The in-memory state of the L1 table is hopefully zeroed out, so it's
>>>>> consistent with what is on disk.
>>>>>
>>>>> The in-memory state of the refcount table looks like it's not in sync
>>>>> with the on-disk state. Note that while the dirty flag allows that the
>>>>> on-disk state can be anything, the in-memory state is what we keep using
>>>>> after a failure. The in-memory state isn't accurate at this point, but
>>>>> we only create leaks. Lots of them, because we zeroed the L1 table, but
>>>>> that's good enough. If refcounts are updated later, the old offsets
>>>>> should still be valid.
>>>> If we set at least parts of the in-memory reftable to zero,
>>>> everything probably breaks. Try to allocate a new cluster while the
>>>> beginning of the reftable is zero. So we cannot take the on-disk
>>>> reftable into memory.
>>>>
>>>> Doing it the other way around, writing the in-memory reftable to
>>>> disk on error won't work either. The refblocks may have been zeroed
>>>> out, so we have exactly the same problem.
>>>>
>>>> Therefore, to make the BDS usable after error, we have to (in the
>>>> error path) read the on-disk reftable into memory, call the qcow2
>>>> repair function and hope for the best.
>>>>
>>>> Or, you know, we could go back to v11 which had my other version of
>>>> this patch which always kept everything consistent. :-P
>>> I'm not sure how important it is to keep the BDS usable after such an
>>> unlikely error.
>>>
>>> I started to write up a suggestion that we could do without
>>> qcow2_alloc_clusters(), but just build up the first refcount block
>>> ourselves. Doesn't really help us, because the new state is not what we
>>> need here, but it made me aware that it assumes that the L1 table is
>>> small enough to fit in the first refcount block and we would have to
>>> fall back to discard if it isn't. Come to think of it, I suspect you
>>> already make the same assumption without checking it.
>>>
>>>
>>> Anyway, if we want to keep the BDS usable what is the right state?
>>> We need references for the header, for the L1 table, for the refcount
>>> table and all refcount blocks. If we decide to build a new in-memory
>>> refcount table, the refcount blocks are only the refcount blocks that
>>> are still referenced there, i.e. we don't have to worry about the
>>> location of refcount blocks on the disk.
>>>
>>> In fact, we can also safely change the refcount table offset to
>>> cluster 1 and handle it together with the header. We'll then need one
>>> refcount block for header/reftable and potentially another one for the
>>> L1 table, which still must be in its original place (adding the
>>> assumption that the L1 table doesn't cross a refblock boundary :-/).
>>>
>>> Our refblock cache can hold 4 tables at least, so creating two refblocks
>>> purely in memory is doable.
>>>
>>> Leaves the question, is it worth the hassle?
>> Probably not. Would you be fine with setting bs->drv to NULL in the
>> problematic error paths?
> By calling qcow2_signal_corruption(), right? I think that's fine.

I don't know. The image isn't corrupted, it's just dirty. When you open 
it, it'll work fine. I'd just output an own error here and set bs->drv 
to NULL.

> What about the assumption that 3 + l1_size fits in one refcount block?
> Do we need to check it even now?

The alternative would be not to allocate 3 + l1_size, but rather just 3. 
Then we have a working refcount structure and that's the most important 
thing (the L1 table is not yet accounted for, but repair can fix that).

Afterwards, we allocate the L1 table. With this change, we don't know 
its offset yet, so we may have to change it again. But we can easily do 
that by just setting s->l1_size to 0 and call qcow2_grow_l1_table().

Max
Kevin Wolf Oct. 23, 2014, 9:42 a.m. UTC | #8
Am 23.10.2014 um 11:11 hat Max Reitz geschrieben:
> >>>Leaves the question, is it worth the hassle?
> >>Probably not. Would you be fine with setting bs->drv to NULL in the
> >>problematic error paths?
> >By calling qcow2_signal_corruption(), right? I think that's fine.
> 
> I don't know. The image isn't corrupted, it's just dirty. When you
> open it, it'll work fine. I'd just output an own error here and set
> bs->drv to NULL.

Okay.

> >What about the assumption that 3 + l1_size fits in one refcount block?
> >Do we need to check it even now?
> 
> The alternative would be not to allocate 3 + l1_size, but rather
> just 3. Then we have a working refcount structure and that's the
> most important thing (the L1 table is not yet accounted for, but
> repair can fix that).
> 
> Afterwards, we allocate the L1 table. With this change, we don't
> know its offset yet, so we may have to change it again. But we can
> easily do that by just setting s->l1_size to 0 and call
> qcow2_grow_l1_table().

How do you make sure that the L1 table always stays zeroed then?

And seriously, if you have an L1 table that doesn't fit in one refcount
block, you're doing something wrong and deserve the slow fallback.

Or, if we really want, we could zero out 2 + l1_size + num_rb clusters
and hook up multiple refcount blocks. The new limit would then be at the
point where the refcount table exceeds a cluster.

Kevin
Max Reitz Oct. 23, 2014, 9:44 a.m. UTC | #9
On 2014-10-23 at 11:42, Kevin Wolf wrote:
> Am 23.10.2014 um 11:11 hat Max Reitz geschrieben:
>>>>> Leaves the question, is it worth the hassle?
>>>> Probably not. Would you be fine with setting bs->drv to NULL in the
>>>> problematic error paths?
>>> By calling qcow2_signal_corruption(), right? I think that's fine.
>> I don't know. The image isn't corrupted, it's just dirty. When you
>> open it, it'll work fine. I'd just output an own error here and set
>> bs->drv to NULL.
> Okay.
>
>>> What about the assumption that 3 + l1_size fits in one refcount block?
>>> Do we need to check it even now?
>> The alternative would be not to allocate 3 + l1_size, but rather
>> just 3. Then we have a working refcount structure and that's the
>> most important thing (the L1 table is not yet accounted for, but
>> repair can fix that).
>>
>> Afterwards, we allocate the L1 table. With this change, we don't
>> know its offset yet, so we may have to change it again. But we can
>> easily do that by just setting s->l1_size to 0 and call
>> qcow2_grow_l1_table().
> How do you make sure that the L1 table always stays zeroed then?

Details...

> And seriously, if you have an L1 table that doesn't fit in one refcount
> block, you're doing something wrong and deserve the slow fallback.

Probably so, yes.

> Or, if we really want, we could zero out 2 + l1_size + num_rb clusters
> and hook up multiple refcount blocks. The new limit would then be at the
> point where the refcount table exceeds a cluster.

Well, if we're falling back to maths, v11 is always there. :-P

Max
diff mbox

Patch

diff --git a/block/blkdebug.c b/block/blkdebug.c
index e046b92..862d93b 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -195,6 +195,8 @@  static const char *event_names[BLKDBG_EVENT_MAX] = {
     [BLKDBG_PWRITEV]                        = "pwritev",
     [BLKDBG_PWRITEV_ZERO]                   = "pwritev_zero",
     [BLKDBG_PWRITEV_DONE]                   = "pwritev_done",
+
+    [BLKDBG_EMPTY_IMAGE_PREPARE]            = "empty_image_prepare",
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)
diff --git a/block/qcow2.c b/block/qcow2.c
index 1ef3a5f..16dece2 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2232,24 +2232,137 @@  fail:
 
 static int qcow2_make_empty(BlockDriverState *bs)
 {
+    BDRVQcowState *s = bs->opaque;
     int ret = 0;
-    uint64_t start_sector;
-    int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
 
-    for (start_sector = 0; start_sector < bs->total_sectors;
-         start_sector += sector_step)
-    {
-        /* As this function is generally used after committing an external
-         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
-         * default action for this kind of discard is to pass the discard,
-         * which will ideally result in an actually smaller image file, as
-         * is probably desired. */
-        ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
-                                     MIN(sector_step,
-                                         bs->total_sectors - start_sector),
-                                     QCOW2_DISCARD_SNAPSHOT, true);
+    if (s->snapshots || s->qcow_version < 3) {
+        uint64_t start_sector;
+        int sector_step = INT_MAX / BDRV_SECTOR_SIZE;
+
+        /* If there are snapshots, every active cluster has to be discarded; and
+         * because compat=0.10 does not support setting the dirty flag, we have
+         * to use this fallback there as well */
+
+        for (start_sector = 0; start_sector < bs->total_sectors;
+             start_sector += sector_step)
+        {
+            /* As this function is generally used after committing an external
+             * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
+             * default action for this kind of discard is to pass the discard,
+             * which will ideally result in an actually smaller image file, as
+             * is probably desired. */
+            ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
+                                         MIN(sector_step,
+                                             bs->total_sectors - start_sector),
+                                         QCOW2_DISCARD_SNAPSHOT, true);
+            if (ret < 0) {
+                break;
+            }
+        }
+    } else {
+        int l1_clusters;
+        int64_t offset;
+        uint64_t *new_reftable;
+        uint64_t rt_entry;
+        struct {
+            uint64_t l1_offset;
+            uint64_t reftable_offset;
+            uint32_t reftable_clusters;
+        } QEMU_PACKED l1_ofs_rt_ofs_cls;
+
+        ret = qcow2_cache_empty(bs, s->l2_table_cache);
         if (ret < 0) {
-            break;
+            return ret;
+        }
+
+        ret = qcow2_cache_empty(bs, s->refcount_block_cache);
+        if (ret < 0) {
+            return ret;
+        }
+
+        /* Refcounts will be broken utterly */
+        ret = qcow2_mark_dirty(bs);
+        if (ret < 0) {
+            return ret;
+        }
+
+        l1_clusters = DIV_ROUND_UP(s->l1_size,
+                                   s->cluster_size / sizeof(uint64_t));
+        new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t));
+        if (!new_reftable) {
+            return -ENOMEM;
+        }
+
+        BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
+
+        /* Overwrite enough clusters at the beginning of the sectors to place
+         * the refcount table, a refcount block and the L1 table in; this may
+         * overwrite parts of the existing refcount and L1 table, which is not
+         * an issue because the dirty flag is set, complete data loss is in fact
+         * desired and partial data loss is consequently fine as well */
+        ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE,
+                                (2 + l1_clusters) * s->cluster_size /
+                                BDRV_SECTOR_SIZE, 0);
+        if (ret < 0) {
+            g_free(new_reftable);
+            return ret;
+        }
+
+        BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
+        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
+
+        /* "Create" an empty reftable (one cluster) directly after the image
+         * header and an empty L1 table three clusters after the image header;
+         * the cluster between those two will be used as the first refblock */
+        cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size);
+        cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size);
+        cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1);
+        ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
+                               &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls));
+        if (ret < 0) {
+            g_free(new_reftable);
+            return ret;
+        }
+
+        s->l1_table_offset = 3 * s->cluster_size;
+        memset(s->l1_table, 0, s->l1_size * sizeof(uint64_t));
+
+        s->refcount_table_offset = s->cluster_size;
+        s->refcount_table_size   = s->cluster_size / sizeof(uint64_t);
+
+        g_free(s->refcount_table);
+        s->refcount_table = new_reftable;
+
+        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
+
+        /* Enter the first refblock into the reftable */
+        rt_entry = cpu_to_be64(2 * s->cluster_size);
+        ret = bdrv_pwrite_sync(bs->file, s->cluster_size,
+                               &rt_entry, sizeof(rt_entry));
+        if (ret < 0) {
+            return ret;
+        }
+
+        s->refcount_table[0] = 2 * s->cluster_size;
+
+        ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size);
+        if (ret < 0) {
+            return ret;
+        }
+
+        s->free_cluster_index = 0;
+        offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size +
+                                      s->l1_size * sizeof(uint64_t));
+        if (offset < 0) {
+            return offset;
+        } else if (offset > 0) {
+            error_report("First cluster in emptied image is in use");
+            abort();
+        }
+
+        ret = qcow2_mark_clean(bs);
+        if (ret < 0) {
+            return ret;
         }
     }
 
diff --git a/include/block/block.h b/include/block/block.h
index 341054d..b1f4385 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -498,6 +498,8 @@  typedef enum {
     BLKDBG_PWRITEV_ZERO,
     BLKDBG_PWRITEV_DONE,
 
+    BLKDBG_EMPTY_IMAGE_PREPARE,
+
     BLKDBG_EVENT_MAX,
 } BlkDebugEvent;