diff mbox

[v5,3/4] qcow2: add shrink image support

Message ID 20170712114700.20008-4-pbutsykin@virtuozzo.com
State New
Headers show

Commit Message

Pavel Butsykin July 12, 2017, 11:46 a.m. UTC
This patch add shrinking of the image file for qcow2. As a result, this allows
us to reduce the virtual image size and free up space on the disk without
copying the image. Image can be fragmented and shrink is done by punching holes
in the image file.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c  |  40 ++++++++++++++++++
 block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c          |  43 +++++++++++++++----
 block/qcow2.h          |  14 +++++++
 qapi/block-core.json   |   3 +-
 5 files changed, 200 insertions(+), 10 deletions(-)

Comments

Kevin Wolf July 12, 2017, 2:52 p.m. UTC | #1
Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
> This patch add shrinking of the image file for qcow2. As a result, this allows
> us to reduce the virtual image size and free up space on the disk without
> copying the image. Image can be fragmented and shrink is done by punching holes
> in the image file.
> 
> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2-cluster.c  |  40 ++++++++++++++++++
>  block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.c          |  43 +++++++++++++++----
>  block/qcow2.h          |  14 +++++++
>  qapi/block-core.json   |   3 +-
>  5 files changed, 200 insertions(+), 10 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index f06c08f64c..518429c64b 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -32,6 +32,46 @@
>  #include "qemu/bswap.h"
>  #include "trace.h"
>  
> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
> +{
> +    BDRVQcow2State *s = bs->opaque;
> +    int new_l1_size, i, ret;
> +
> +    if (exact_size >= s->l1_size) {
> +        return 0;
> +    }
> +
> +    new_l1_size = exact_size;
> +
> +#ifdef DEBUG_ALLOC2
> +    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
> +#endif
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
> +                                       sizeof(uint64_t) * new_l1_size,
> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_flush(bs->file->bs);
> +    if (ret < 0) {
> +        return ret;
> +    }

If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
have entries zeroed out on disk, but in memory we still have the
original L1 table.

Should the in-memory L1 table be zeroed first? Then we can't
accidentally reuse stale entries, but would have to allocate new ones,
which would get on-disk state and in-memory state back in sync again.

> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
> +            continue;
> +        }
> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
> +                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
> +        s->l1_table[i] = 0;
> +    }
> +    return 0;
> +}
> +
>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                          bool exact_size)
>  {

I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
I hope Max has.

Kevin
Max Reitz July 12, 2017, 4:58 p.m. UTC | #2
On 2017-07-12 16:52, Kevin Wolf wrote:
> Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
>> This patch add shrinking of the image file for qcow2. As a result, this allows
>> us to reduce the virtual image size and free up space on the disk without
>> copying the image. Image can be fragmented and shrink is done by punching holes
>> in the image file.
>>
>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>> ---
>>  block/qcow2-cluster.c  |  40 ++++++++++++++++++
>>  block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  block/qcow2.c          |  43 +++++++++++++++----
>>  block/qcow2.h          |  14 +++++++
>>  qapi/block-core.json   |   3 +-
>>  5 files changed, 200 insertions(+), 10 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index f06c08f64c..518429c64b 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -32,6 +32,46 @@
>>  #include "qemu/bswap.h"
>>  #include "trace.h"
>>  
>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
>> +{
>> +    BDRVQcow2State *s = bs->opaque;
>> +    int new_l1_size, i, ret;
>> +
>> +    if (exact_size >= s->l1_size) {
>> +        return 0;
>> +    }
>> +
>> +    new_l1_size = exact_size;
>> +
>> +#ifdef DEBUG_ALLOC2
>> +    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
>> +#endif
>> +
>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>> +                                       sizeof(uint64_t) * new_l1_size,
>> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_flush(bs->file->bs);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
> 
> If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
> have entries zeroed out on disk, but in memory we still have the
> original L1 table.
> 
> Should the in-memory L1 table be zeroed first? Then we can't
> accidentally reuse stale entries, but would have to allocate new ones,
> which would get on-disk state and in-memory state back in sync again.

Yes, I thought of the same.  But this implies that the allocation is
able to modify the L1 table, and I find that unlikely if that
bdrv_flush() failed already...

So I concluded not to have an opinion on which order is better.

>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>> +            continue;
>> +        }
>> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>> +                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
>> +        s->l1_table[i] = 0;
>> +    }
>> +    return 0;
>> +}
>> +
>>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>                          bool exact_size)
>>  {
> 
> I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
> I hope Max has.

Well, it's exactly the same there.

Max
Kevin Wolf July 13, 2017, 8:41 a.m. UTC | #3
Am 12.07.2017 um 18:58 hat Max Reitz geschrieben:
> On 2017-07-12 16:52, Kevin Wolf wrote:
> > Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
> >> This patch add shrinking of the image file for qcow2. As a result, this allows
> >> us to reduce the virtual image size and free up space on the disk without
> >> copying the image. Image can be fragmented and shrink is done by punching holes
> >> in the image file.
> >>
> >> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> >> Reviewed-by: Max Reitz <mreitz@redhat.com>
> >> ---
> >>  block/qcow2-cluster.c  |  40 ++++++++++++++++++
> >>  block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  block/qcow2.c          |  43 +++++++++++++++----
> >>  block/qcow2.h          |  14 +++++++
> >>  qapi/block-core.json   |   3 +-
> >>  5 files changed, 200 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> >> index f06c08f64c..518429c64b 100644
> >> --- a/block/qcow2-cluster.c
> >> +++ b/block/qcow2-cluster.c
> >> @@ -32,6 +32,46 @@
> >>  #include "qemu/bswap.h"
> >>  #include "trace.h"
> >>  
> >> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
> >> +{
> >> +    BDRVQcow2State *s = bs->opaque;
> >> +    int new_l1_size, i, ret;
> >> +
> >> +    if (exact_size >= s->l1_size) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    new_l1_size = exact_size;
> >> +
> >> +#ifdef DEBUG_ALLOC2
> >> +    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
> >> +#endif
> >> +
> >> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
> >> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
> >> +                                       sizeof(uint64_t) * new_l1_size,
> >> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
> >> +    if (ret < 0) {
> >> +        return ret;
> >> +    }
> >> +
> >> +    ret = bdrv_flush(bs->file->bs);
> >> +    if (ret < 0) {
> >> +        return ret;
> >> +    }
> > 
> > If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
> > have entries zeroed out on disk, but in memory we still have the
> > original L1 table.
> > 
> > Should the in-memory L1 table be zeroed first? Then we can't
> > accidentally reuse stale entries, but would have to allocate new ones,
> > which would get on-disk state and in-memory state back in sync again.
> 
> Yes, I thought of the same.  But this implies that the allocation is
> able to modify the L1 table, and I find that unlikely if that
> bdrv_flush() failed already...
> 
> So I concluded not to have an opinion on which order is better.

Well, let me ask the other way round: Is there any disadvantage in first
zeroing the in-memory table and then writing to the image?

If I have a choice between "always safe" and "not completely safe, but I
think it's unlikely to happen", especially in image formats, then I will
certainly take the "always safe".

> >> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
> >> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
> >> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
> >> +            continue;
> >> +        }
> >> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
> >> +                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
> >> +        s->l1_table[i] = 0;
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> >>                          bool exact_size)
> >>  {
> > 
> > I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
> > I hope Max has.
> 
> Well, it's exactly the same there.

Ok, so I'll object to this code without really having looked at it.

Kevin
Max Reitz July 13, 2017, 2:36 p.m. UTC | #4
On 2017-07-13 10:41, Kevin Wolf wrote:
> Am 12.07.2017 um 18:58 hat Max Reitz geschrieben:
>> On 2017-07-12 16:52, Kevin Wolf wrote:
>>> Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
>>>> This patch add shrinking of the image file for qcow2. As a result, this allows
>>>> us to reduce the virtual image size and free up space on the disk without
>>>> copying the image. Image can be fragmented and shrink is done by punching holes
>>>> in the image file.
>>>>
>>>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>  block/qcow2-cluster.c  |  40 ++++++++++++++++++
>>>>  block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  block/qcow2.c          |  43 +++++++++++++++----
>>>>  block/qcow2.h          |  14 +++++++
>>>>  qapi/block-core.json   |   3 +-
>>>>  5 files changed, 200 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>>> index f06c08f64c..518429c64b 100644
>>>> --- a/block/qcow2-cluster.c
>>>> +++ b/block/qcow2-cluster.c
>>>> @@ -32,6 +32,46 @@
>>>>  #include "qemu/bswap.h"
>>>>  #include "trace.h"
>>>>  
>>>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
>>>> +{
>>>> +    BDRVQcow2State *s = bs->opaque;
>>>> +    int new_l1_size, i, ret;
>>>> +
>>>> +    if (exact_size >= s->l1_size) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    new_l1_size = exact_size;
>>>> +
>>>> +#ifdef DEBUG_ALLOC2
>>>> +    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
>>>> +#endif
>>>> +
>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>>>> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>>>> +                                       sizeof(uint64_t) * new_l1_size,
>>>> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_flush(bs->file->bs);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>
>>> If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
>>> have entries zeroed out on disk, but in memory we still have the
>>> original L1 table.
>>>
>>> Should the in-memory L1 table be zeroed first? Then we can't
>>> accidentally reuse stale entries, but would have to allocate new ones,
>>> which would get on-disk state and in-memory state back in sync again.
>>
>> Yes, I thought of the same.  But this implies that the allocation is
>> able to modify the L1 table, and I find that unlikely if that
>> bdrv_flush() failed already...
>>
>> So I concluded not to have an opinion on which order is better.
> 
> Well, let me ask the other way round: Is there any disadvantage in first
> zeroing the in-memory table and then writing to the image?

I was informed that the code would be harder to write. :-)

> If I have a choice between "always safe" and "not completely safe, but I
> think it's unlikely to happen", especially in image formats, then I will
> certainly take the "always safe".
> 
>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>>>> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>>>> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>>>> +            continue;
>>>> +        }
>>>> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>>>> +                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
>>>> +        s->l1_table[i] = 0;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>>                          bool exact_size)
>>>>  {
>>>
>>> I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
>>> I hope Max has.
>>
>> Well, it's exactly the same there.
> 
> Ok, so I'll object to this code without really having looked at it.
I won't object to your objection. O:-)

Max
Pavel Butsykin July 13, 2017, 5:18 p.m. UTC | #5
On 13.07.2017 11:41, Kevin Wolf wrote:
> Am 12.07.2017 um 18:58 hat Max Reitz geschrieben:
>> On 2017-07-12 16:52, Kevin Wolf wrote:
>>> Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
>>>> This patch add shrinking of the image file for qcow2. As a result, this allows
>>>> us to reduce the virtual image size and free up space on the disk without
>>>> copying the image. Image can be fragmented and shrink is done by punching holes
>>>> in the image file.
>>>>
>>>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>   block/qcow2-cluster.c  |  40 ++++++++++++++++++
>>>>   block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>   block/qcow2.c          |  43 +++++++++++++++----
>>>>   block/qcow2.h          |  14 +++++++
>>>>   qapi/block-core.json   |   3 +-
>>>>   5 files changed, 200 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>>> index f06c08f64c..518429c64b 100644
>>>> --- a/block/qcow2-cluster.c
>>>> +++ b/block/qcow2-cluster.c
>>>> @@ -32,6 +32,46 @@
>>>>   #include "qemu/bswap.h"
>>>>   #include "trace.h"
>>>>   
>>>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
>>>> +{
>>>> +    BDRVQcow2State *s = bs->opaque;
>>>> +    int new_l1_size, i, ret;
>>>> +
>>>> +    if (exact_size >= s->l1_size) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    new_l1_size = exact_size;
>>>> +
>>>> +#ifdef DEBUG_ALLOC2
>>>> +    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
>>>> +#endif
>>>> +
>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>>>> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>>>> +                                       sizeof(uint64_t) * new_l1_size,
>>>> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>> +
>>>> +    ret = bdrv_flush(bs->file->bs);
>>>> +    if (ret < 0) {
>>>> +        return ret;
>>>> +    }
>>>
>>> If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
>>> have entries zeroed out on disk, but in memory we still have the
>>> original L1 table.
>>>
>>> Should the in-memory L1 table be zeroed first? Then we can't
>>> accidentally reuse stale entries, but would have to allocate new ones,
>>> which would get on-disk state and in-memory state back in sync again.
>>
>> Yes, I thought of the same.  But this implies that the allocation is
>> able to modify the L1 table, and I find that unlikely if that
>> bdrv_flush() failed already...
>>
>> So I concluded not to have an opinion on which order is better.
> 
> Well, let me ask the other way round: Is there any disadvantage in first
> zeroing the in-memory table and then writing to the image?

If bdrv_flush/drv_pwrite_zeroes function failed, the subsequent writes
to truncated area lead to allocation L2 tables. This implies two things:

1. We need call qcow2_free_clusters() after bdrv_flush/drv_pwrite_zeroes
anyway, otherwise it may lead to the situation when the l1 table will
have two identical offsets.

2. Old l2 blocks may be lost and will be dead weight for the image.

> If I have a choice between "always safe" and "not completely safe, but I
> think it's unlikely to happen", especially in image formats, then I will
> certainly take the "always safe".

In my understanding both cases are "unsafe", because both cases may lead
to inconsistent state between image and memory. When writing this code I
was looking at an existing approach in qcow2*.c to such kind of issues:
...
static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
...
     trace_qcow2_l2_allocate_write_l1(bs, l1_index);
     s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
     ret = qcow2_write_l1_entry(bs, l1_index);
     if (ret < 0) {
         goto fail;
     }

     *table = l2_table;
     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
     return 0;

fail:
     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
     if (l2_table != NULL) {
         qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
     }
     s->l1_table[l1_index] = old_l2_offset;
     if (l2_offset > 0) {
         qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
                             QCOW2_DISCARD_ALWAYS);
     }
     return ret;

Do I understand correctly? that if qcow2_write_l1_entr call fails
somewhere in the middle of bdrv_flush() then l1_table still contain the
old l2_offset, but the image may already contain the new l2_offset. We
can continue to use the image and write data, but in this case we lose
the data after reopening image.

So it seemed to me that in current conditions we can't escape from such
kind of problem completely. But I understand your desire to do better
even in little things, I agree that would be a little safer in the case
"first zeroing the in-memory table and then writing". So if this
solution suits all, let me write the next patch-set version :)

>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>>>> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>>>> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>>>> +            continue;
>>>> +        }
>>>> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>>>> +                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
>>>> +        s->l1_table[i] = 0;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>>                           bool exact_size)
>>>>   {
>>>
>>> I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
>>> I hope Max has.
>>
>> Well, it's exactly the same there.
> 
> Ok, so I'll object to this code without really having looked at it.
> 
> Kevin
>
Pavel Butsykin July 13, 2017, 5:28 p.m. UTC | #6
On 13.07.2017 17:36, Max Reitz wrote:
> On 2017-07-13 10:41, Kevin Wolf wrote:
>> Am 12.07.2017 um 18:58 hat Max Reitz geschrieben:
>>> On 2017-07-12 16:52, Kevin Wolf wrote:
>>>> Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
>>>>> This patch add shrinking of the image file for qcow2. As a result, this allows
>>>>> us to reduce the virtual image size and free up space on the disk without
>>>>> copying the image. Image can be fragmented and shrink is done by punching holes
>>>>> in the image file.
>>>>>
>>>>> Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
>>>>> Reviewed-by: Max Reitz <mreitz@redhat.com>
>>>>> ---
>>>>>   block/qcow2-cluster.c  |  40 ++++++++++++++++++
>>>>>   block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>   block/qcow2.c          |  43 +++++++++++++++----
>>>>>   block/qcow2.h          |  14 +++++++
>>>>>   qapi/block-core.json   |   3 +-
>>>>>   5 files changed, 200 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>>>>> index f06c08f64c..518429c64b 100644
>>>>> --- a/block/qcow2-cluster.c
>>>>> +++ b/block/qcow2-cluster.c
>>>>> @@ -32,6 +32,46 @@
>>>>>   #include "qemu/bswap.h"
>>>>>   #include "trace.h"
>>>>>   
>>>>> +int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
>>>>> +{
>>>>> +    BDRVQcow2State *s = bs->opaque;
>>>>> +    int new_l1_size, i, ret;
>>>>> +
>>>>> +    if (exact_size >= s->l1_size) {
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    new_l1_size = exact_size;
>>>>> +
>>>>> +#ifdef DEBUG_ALLOC2
>>>>> +    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
>>>>> +#endif
>>>>> +
>>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
>>>>> +    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
>>>>> +                                       sizeof(uint64_t) * new_l1_size,
>>>>> +                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    ret = bdrv_flush(bs->file->bs);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>
>>>> If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
>>>> have entries zeroed out on disk, but in memory we still have the
>>>> original L1 table.
>>>>
>>>> Should the in-memory L1 table be zeroed first? Then we can't
>>>> accidentally reuse stale entries, but would have to allocate new ones,
>>>> which would get on-disk state and in-memory state back in sync again.
>>>
>>> Yes, I thought of the same.  But this implies that the allocation is
>>> able to modify the L1 table, and I find that unlikely if that
>>> bdrv_flush() failed already...
>>>
>>> So I concluded not to have an opinion on which order is better.
>>
>> Well, let me ask the other way round: Is there any disadvantage in first
>> zeroing the in-memory table and then writing to the image?
> 
> I was informed that the code would be harder to write. :-)
> 
>> If I have a choice between "always safe" and "not completely safe, but I
>> think it's unlikely to happen", especially in image formats, then I will
>> certainly take the "always safe".
>>
>>>>> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
>>>>> +    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
>>>>> +        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
>>>>> +            continue;
>>>>> +        }
>>>>> +        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
>>>>> +                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
>>>>> +        s->l1_table[i] = 0;
>>>>> +    }
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>>>>>                           bool exact_size)
>>>>>   {
>>>>
>>>> I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
>>>> I hope Max has.
>>>
>>> Well, it's exactly the same there.
>>
>> Ok, so I'll object to this code without really having looked at it.
> I won't object to your objection. O:-)

Kevin,

Can you help me to reduce the number of patch-set versions? :)
And look at the rest part of the series, thanks!

> Max
>
Kevin Wolf July 14, 2017, 7:28 a.m. UTC | #7
Am 13.07.2017 um 19:28 hat Pavel Butsykin geschrieben:
> On 13.07.2017 17:36, Max Reitz wrote:
> >On 2017-07-13 10:41, Kevin Wolf wrote:
> >>Am 12.07.2017 um 18:58 hat Max Reitz geschrieben:
> >>>On 2017-07-12 16:52, Kevin Wolf wrote:
> >>>>Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
> >>>>>This patch add shrinking of the image file for qcow2. As a result, this allows
> >>>>>us to reduce the virtual image size and free up space on the disk without
> >>>>>copying the image. Image can be fragmented and shrink is done by punching holes
> >>>>>in the image file.
> >>>>>
> >>>>>Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> >>>>>Reviewed-by: Max Reitz <mreitz@redhat.com>
> >>>>>---
> >>>>>  block/qcow2-cluster.c  |  40 ++++++++++++++++++
> >>>>>  block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>  block/qcow2.c          |  43 +++++++++++++++----
> >>>>>  block/qcow2.h          |  14 +++++++
> >>>>>  qapi/block-core.json   |   3 +-
> >>>>>  5 files changed, 200 insertions(+), 10 deletions(-)
> >>>>>
> >>>>>diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> >>>>>index f06c08f64c..518429c64b 100644
> >>>>>--- a/block/qcow2-cluster.c
> >>>>>+++ b/block/qcow2-cluster.c
> >>>>>@@ -32,6 +32,46 @@
> >>>>>  #include "qemu/bswap.h"
> >>>>>  #include "trace.h"
> >>>>>+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
> >>>>>+{
> >>>>>+    BDRVQcow2State *s = bs->opaque;
> >>>>>+    int new_l1_size, i, ret;
> >>>>>+
> >>>>>+    if (exact_size >= s->l1_size) {
> >>>>>+        return 0;
> >>>>>+    }
> >>>>>+
> >>>>>+    new_l1_size = exact_size;
> >>>>>+
> >>>>>+#ifdef DEBUG_ALLOC2
> >>>>>+    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
> >>>>>+#endif
> >>>>>+
> >>>>>+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
> >>>>>+    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
> >>>>>+                                       sizeof(uint64_t) * new_l1_size,
> >>>>>+                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
> >>>>>+    if (ret < 0) {
> >>>>>+        return ret;
> >>>>>+    }
> >>>>>+
> >>>>>+    ret = bdrv_flush(bs->file->bs);
> >>>>>+    if (ret < 0) {
> >>>>>+        return ret;
> >>>>>+    }
> >>>>
> >>>>If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
> >>>>have entries zeroed out on disk, but in memory we still have the
> >>>>original L1 table.
> >>>>
> >>>>Should the in-memory L1 table be zeroed first? Then we can't
> >>>>accidentally reuse stale entries, but would have to allocate new ones,
> >>>>which would get on-disk state and in-memory state back in sync again.
> >>>
> >>>Yes, I thought of the same.  But this implies that the allocation is
> >>>able to modify the L1 table, and I find that unlikely if that
> >>>bdrv_flush() failed already...
> >>>
> >>>So I concluded not to have an opinion on which order is better.
> >>
> >>Well, let me ask the other way round: Is there any disadvantage in first
> >>zeroing the in-memory table and then writing to the image?
> >
> >I was informed that the code would be harder to write. :-)
> >
> >>If I have a choice between "always safe" and "not completely safe, but I
> >>think it's unlikely to happen", especially in image formats, then I will
> >>certainly take the "always safe".
> >>
> >>>>>+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
> >>>>>+    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
> >>>>>+        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
> >>>>>+            continue;
> >>>>>+        }
> >>>>>+        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
> >>>>>+                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
> >>>>>+        s->l1_table[i] = 0;
> >>>>>+    }
> >>>>>+    return 0;
> >>>>>+}
> >>>>>+
> >>>>>  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> >>>>>                          bool exact_size)
> >>>>>  {
> >>>>
> >>>>I haven't checked qcow2_shrink_reftable() for similar kinds of problems,
> >>>>I hope Max has.
> >>>
> >>>Well, it's exactly the same there.
> >>
> >>Ok, so I'll object to this code without really having looked at it.
> >I won't object to your objection. O:-)
> 
> Kevin,
> 
> Can you help me to reduce the number of patch-set versions? :)
> And look at the rest part of the series, thanks!

Patches 1 and 2 looked good to me. I didn't have a look at the test
case, but if it passes and Max reviewed it, it can't be too bad. ;-)

Kevin
Kevin Wolf July 14, 2017, 7:44 a.m. UTC | #8
Am 13.07.2017 um 19:18 hat Pavel Butsykin geschrieben:
> On 13.07.2017 11:41, Kevin Wolf wrote:
> >Am 12.07.2017 um 18:58 hat Max Reitz geschrieben:
> >>On 2017-07-12 16:52, Kevin Wolf wrote:
> >>>Am 12.07.2017 um 13:46 hat Pavel Butsykin geschrieben:
> >>>>This patch add shrinking of the image file for qcow2. As a result, this allows
> >>>>us to reduce the virtual image size and free up space on the disk without
> >>>>copying the image. Image can be fragmented and shrink is done by punching holes
> >>>>in the image file.
> >>>>
> >>>>Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
> >>>>Reviewed-by: Max Reitz <mreitz@redhat.com>
> >>>>---
> >>>>  block/qcow2-cluster.c  |  40 ++++++++++++++++++
> >>>>  block/qcow2-refcount.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  block/qcow2.c          |  43 +++++++++++++++----
> >>>>  block/qcow2.h          |  14 +++++++
> >>>>  qapi/block-core.json   |   3 +-
> >>>>  5 files changed, 200 insertions(+), 10 deletions(-)
> >>>>
> >>>>diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> >>>>index f06c08f64c..518429c64b 100644
> >>>>--- a/block/qcow2-cluster.c
> >>>>+++ b/block/qcow2-cluster.c
> >>>>@@ -32,6 +32,46 @@
> >>>>  #include "qemu/bswap.h"
> >>>>  #include "trace.h"
> >>>>+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
> >>>>+{
> >>>>+    BDRVQcow2State *s = bs->opaque;
> >>>>+    int new_l1_size, i, ret;
> >>>>+
> >>>>+    if (exact_size >= s->l1_size) {
> >>>>+        return 0;
> >>>>+    }
> >>>>+
> >>>>+    new_l1_size = exact_size;
> >>>>+
> >>>>+#ifdef DEBUG_ALLOC2
> >>>>+    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
> >>>>+#endif
> >>>>+
> >>>>+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
> >>>>+    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
> >>>>+                                       sizeof(uint64_t) * new_l1_size,
> >>>>+                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
> >>>>+    if (ret < 0) {
> >>>>+        return ret;
> >>>>+    }
> >>>>+
> >>>>+    ret = bdrv_flush(bs->file->bs);
> >>>>+    if (ret < 0) {
> >>>>+        return ret;
> >>>>+    }
> >>>
> >>>If we have an error here (or after a partial bdrv_pwrite_zeroes()), we
> >>>have entries zeroed out on disk, but in memory we still have the
> >>>original L1 table.
> >>>
> >>>Should the in-memory L1 table be zeroed first? Then we can't
> >>>accidentally reuse stale entries, but would have to allocate new ones,
> >>>which would get on-disk state and in-memory state back in sync again.
> >>
> >>Yes, I thought of the same.  But this implies that the allocation is
> >>able to modify the L1 table, and I find that unlikely if that
> >>bdrv_flush() failed already...
> >>
> >>So I concluded not to have an opinion on which order is better.
> >
> >Well, let me ask the other way round: Is there any disadvantage in first
> >zeroing the in-memory table and then writing to the image?
> 
> If bdrv_flush/drv_pwrite_zeroes function failed, the subsequent writes
> to truncated area lead to allocation L2 tables. This implies two things:
> 
> 1. We need call qcow2_free_clusters() after bdrv_flush/drv_pwrite_zeroes
> anyway, otherwise it may lead to the situation when the l1 table will
> have two identical offsets.

Yes, I'm only talking about zeroing the in-memory L1 table first, not
moving anything else.

> 2. Old l2 blocks may be lost and will be dead weight for the image.

Yes, without a journal, we can't avoid leaked clusters in certain error
cases. They are harmless, though, and 'qemu-img check' can detect and
repair them without any risk.

Leaked clusters are always better than actual image corruption, which is
what would potentially happen with your current order of operations.

> >If I have a choice between "always safe" and "not completely safe, but I
> >think it's unlikely to happen", especially in image formats, then I will
> >certainly take the "always safe".
> 
> In my understanding both cases are "unsafe", because both cases may lead
> to inconsistent state between image and memory.

Yes, we cannot guarantee full consistency of the image, but that doesn't
make it unsafe yet. I would only call things unsafe if we may end up
corrupting user data. This is the most important thing for an image
format to avoid. If at all possible, we need to write our code so that
no matter what happens, the worst thing we get is leaked clusters.

> When writing this code I was looking at an existing approach in
> qcow2*.c to such kind of issues:
> ...
> static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
> ...
>     trace_qcow2_l2_allocate_write_l1(bs, l1_index);
>     s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
>     ret = qcow2_write_l1_entry(bs, l1_index);
>     if (ret < 0) {
>         goto fail;
>     }
> 
>     *table = l2_table;
>     trace_qcow2_l2_allocate_done(bs, l1_index, 0);
>     return 0;
> 
> fail:
>     trace_qcow2_l2_allocate_done(bs, l1_index, ret);
>     if (l2_table != NULL) {
>         qcow2_cache_put(bs, s->l2_table_cache, (void**) table);
>     }
>     s->l1_table[l1_index] = old_l2_offset;
>     if (l2_offset > 0) {
>         qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t),
>                             QCOW2_DISCARD_ALWAYS);
>     }
>     return ret;
> 
> Do I understand correctly? that if qcow2_write_l1_entr call fails
> somewhere in the middle of bdrv_flush() then l1_table still contain the
> old l2_offset, but the image may already contain the new l2_offset. We
> can continue to use the image and write data, but in this case we lose
> the data after reopening image.

You're right, if the error happens somewhere in the middle, we have a
serious problem there. :-(

I think the only way out in this case would be to write the new data at
a different position in the image file than the old data. (Do I hear
anyone say "journal"?)

> So it seemed to me that in current conditions we can't escape from such
> kind of problem completely. But I understand your desire to do better
> even in little things, I agree that would be a little safer in the case
> "first zeroing the in-memory table and then writing". So if this
> solution suits all, let me write the next patch-set version :)

Yay! :-)

Kevin
diff mbox

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index f06c08f64c..518429c64b 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -32,6 +32,46 @@ 
 #include "qemu/bswap.h"
 #include "trace.h"
 
+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t exact_size)
+{
+    BDRVQcow2State *s = bs->opaque;
+    int new_l1_size, i, ret;
+
+    if (exact_size >= s->l1_size) {
+        return 0;
+    }
+
+    new_l1_size = exact_size;
+
+#ifdef DEBUG_ALLOC2
+    fprintf(stderr, "shrink l1_table from %d to %d\n", s->l1_size, new_l1_size);
+#endif
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_WRITE_TABLE);
+    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset +
+                                       sizeof(uint64_t) * new_l1_size,
+                             (s->l1_size - new_l1_size) * sizeof(uint64_t), 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_flush(bs->file->bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_SHRINK_FREE_L2_CLUSTERS);
+    for (i = s->l1_size - 1; i > new_l1_size - 1; i--) {
+        if ((s->l1_table[i] & L1E_OFFSET_MASK) == 0) {
+            continue;
+        }
+        qcow2_free_clusters(bs, s->l1_table[i] & L1E_OFFSET_MASK,
+                            s->cluster_size, QCOW2_DISCARD_ALWAYS);
+        s->l1_table[i] = 0;
+    }
+    return 0;
+}
+
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size)
 {
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 8050db4544..5c8d606d29 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -29,6 +29,7 @@ 
 #include "block/qcow2.h"
 #include "qemu/range.h"
 #include "qemu/bswap.h"
+#include "qemu/cutils.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
@@ -3059,3 +3060,112 @@  done:
     qemu_vfree(new_refblock);
     return ret;
 }
+
+int qcow2_shrink_reftable(BlockDriverState *bs)
+{
+    BDRVQcow2State *s = bs->opaque;
+    uint64_t *reftable_tmp =
+        g_try_malloc(sizeof(uint64_t) * s->refcount_table_size);
+    int i, ret;
+
+    if (s->refcount_table_size && reftable_tmp == NULL) {
+        return -ENOMEM;
+    }
+
+    for (i = 0; i < s->refcount_table_size; i++) {
+        int64_t refblock_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
+        void *refblock;
+        bool unused_block;
+
+        if (refblock_offs == 0) {
+            reftable_tmp[i] = 0;
+            continue;
+        }
+        ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
+                              &refblock);
+        if (ret < 0) {
+            goto out;
+        }
+
+        /* the refblock has own reference */
+        if (i == offset_to_reftable_index(s, refblock_offs)) {
+            uint64_t block_index = (refblock_offs >> s->cluster_bits) &
+                                   (s->refcount_block_size - 1);
+            uint64_t refcount = s->get_refcount(refblock, block_index);
+
+            s->set_refcount(refblock, block_index, 0);
+
+            unused_block = buffer_is_zero(refblock, s->cluster_size);
+
+            s->set_refcount(refblock, block_index, refcount);
+        } else {
+            unused_block = buffer_is_zero(refblock, s->cluster_size);
+        }
+        qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+
+        reftable_tmp[i] = unused_block ? 0 : cpu_to_be64(s->refcount_table[i]);
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset, reftable_tmp,
+                           sizeof(uint64_t) * s->refcount_table_size);
+    if (ret < 0) {
+        goto out;
+    }
+
+    for (i = 0; i < s->refcount_table_size; i++) {
+        if (s->refcount_table[i] && !reftable_tmp[i]) {
+            uint64_t discard_offs = s->refcount_table[i] & REFT_OFFSET_MASK;
+            uint64_t refblock_offs = get_refblock_offset(s, discard_offs);
+            uint64_t cluster_index = discard_offs >> s->cluster_bits;
+            uint32_t block_index = cluster_index & (s->refcount_block_size - 1);
+            void *refblock;
+
+            assert(discard_offs != 0);
+
+            ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offs,
+                                  &refblock);
+            if (ret < 0) {
+                goto out;
+            }
+
+            if (s->get_refcount(refblock, block_index) != 1) {
+                qcow2_signal_corruption(bs, true, -1, -1, "Invalid refcount:"
+                                        " refblock offset %#" PRIx64
+                                        ", reftable index %d"
+                                        ", block offset %#" PRIx64
+                                        ", refcount %#" PRIx64,
+                                        refblock_offs, i, discard_offs,
+                                        s->get_refcount(refblock, block_index));
+                qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+                ret = -EINVAL;
+                goto out;
+            }
+            s->set_refcount(refblock, block_index, 0);
+
+            qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, refblock);
+
+            qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
+
+            if (cluster_index < s->free_cluster_index) {
+                s->free_cluster_index = cluster_index;
+            }
+
+            refblock = qcow2_cache_is_table_offset(bs, s->refcount_block_cache,
+                                                   discard_offs);
+            if (refblock) {
+                /* discard refblock from the cache if refblock is cached */
+                qcow2_cache_discard(bs, s->refcount_block_cache, refblock);
+            }
+            update_refcount_discard(bs, discard_offs, s->cluster_size);
+            s->refcount_table[i] = 0;
+        }
+    }
+
+    if (!s->cache_discards) {
+        qcow2_process_discards(bs, ret);
+    }
+
+out:
+    g_free(reftable_tmp);
+    return ret;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index c144ea5620..bd281fdd04 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3120,18 +3120,43 @@  static int qcow2_truncate(BlockDriverState *bs, int64_t offset,
     }
 
     old_length = bs->total_sectors * 512;
+    new_l1_size = size_to_l1(s, offset);
 
-    /* shrinking is currently not supported */
     if (offset < old_length) {
-        error_setg(errp, "qcow2 doesn't support shrinking images yet");
-        return -ENOTSUP;
-    }
+        if (prealloc != PREALLOC_MODE_OFF) {
+            error_setg(errp,
+                       "Preallocation can't be used for shrinking an image");
+            return -EINVAL;
+        }
 
-    new_l1_size = size_to_l1(s, offset);
-    ret = qcow2_grow_l1_table(bs, new_l1_size, true);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Failed to grow the L1 table");
-        return ret;
+        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
+                                    old_length - ROUND_UP(offset,
+                                                          s->cluster_size),
+                                    QCOW2_DISCARD_ALWAYS, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
+            return ret;
+        }
+
+        ret = qcow2_shrink_l1_table(bs, new_l1_size);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to reduce the number of L2 tables");
+            return ret;
+        }
+
+        ret = qcow2_shrink_reftable(bs);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret,
+                             "Failed to discard unused refblocks");
+            return ret;
+        }
+    } else {
+        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
+            return ret;
+        }
     }
 
     switch (prealloc) {
diff --git a/block/qcow2.h b/block/qcow2.h
index 52c374e9ed..5a289a81e2 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -521,6 +521,18 @@  static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
     return r1 > r2 ? r1 - r2 : r2 - r1;
 }
 
+static inline
+uint32_t offset_to_reftable_index(BDRVQcow2State *s, uint64_t offset)
+{
+    return offset >> (s->refcount_block_bits + s->cluster_bits);
+}
+
+static inline uint64_t get_refblock_offset(BDRVQcow2State *s, uint64_t offset)
+{
+    uint32_t index = offset_to_reftable_index(s, offset);
+    return s->refcount_table[index] & REFT_OFFSET_MASK;
+}
+
 /* qcow2.c functions */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                   int64_t sector_num, int nb_sectors);
@@ -584,10 +596,12 @@  int qcow2_inc_refcounts_imrt(BlockDriverState *bs, BdrvCheckResult *res,
 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
                                 BlockDriverAmendStatusCB *status_cb,
                                 void *cb_opaque, Error **errp);
+int qcow2_shrink_reftable(BlockDriverState *bs);
 
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
+int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c437aa50ef..99cef55b7c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2487,7 +2487,8 @@ 
             'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
             'flush_to_disk', 'pwritev_rmw_head', 'pwritev_rmw_after_head',
             'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev',
-            'pwritev_zero', 'pwritev_done', 'empty_image_prepare' ] }
+            'pwritev_zero', 'pwritev_done', 'empty_image_prepare',
+            'l1_shrink_write_table', 'l1_shrink_free_l2_clusters' ] }
 
 ##
 # @BlkdebugInjectErrorOptions: