diff mbox

[v5,1/3] qcow2: Add qcow2_shrink_l1_and_l2_table for qcow2 shrinking

Message ID 1414336849-21179-2-git-send-email-junmuzi@gmail.com
State New
Headers show

Commit Message

lijun Oct. 26, 2014, 3:20 p.m. UTC
This patch implements the new function qcow2_shrink_l1_and_l2_table.
This function shrinks/discards the l1 and l2 tables when shrinking a qcow2 image.

Signed-off-by: Jun Li <junmuzi@gmail.com>
---
v5:
  Made some modifications based on Max's suggestions. Thanks, Max.
  In v5, l2 shrinking is done first, then l1 shrinking, in function qcow2_shrink_l1_and_l2_table. As l1 shrinking needs to allocate some clusters for the new l1 table, v5 can re-use the freed clusters that come from l2 shrinking.

v4:
  Added handling of COW clusters in the l2 table. When using COW, some of (l2_entry >>
s->cluster_bits) will be larger than s->refcount_table_size, so this l2_entry needs
to be discarded.

v3:
  Fixed host cluster leak.
---
 block/qcow2-cluster.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c         |  37 +++++++++-
 block/qcow2.h         |   2 +
 3 files changed, 218 insertions(+), 3 deletions(-)

Comments

Max Reitz Nov. 21, 2014, 10:56 a.m. UTC | #1
On 2014-10-26 at 16:20, Jun Li wrote:
> This patch is the realization of new function qcow2_shrink_l1_and_l2_table.
> This function will shrink/discard l1 and l2 table when do qcow2 shrinking.
>
> Signed-off-by: Jun Li <junmuzi@gmail.com>
> ---
> v5:
>    Do some modifications based on MAX's suggestion. Thanks for MAX.
>    In v5, do l2 shrinking firstly, then do l1 shrinking in function qcow2_shrink_l1_and_l2_table. As do l1 shrinking need to allocate some clusters for new l1 table, so in v5 it can re-use the freed clusters come from l2 shrinking.
>
> v4:
>    Add deal with COW clusters in l2 table. When using COW, some of (l2_entry >>
> s->cluster_bits) will larger than s->refcount_table_size, so need to discard
> this l2_entry.
>
> v3:
>    Fixed host cluster leak.
> ---
>   block/qcow2-cluster.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   block/qcow2.c         |  37 +++++++++-
>   block/qcow2.h         |   2 +
>   3 files changed, 218 insertions(+), 3 deletions(-)
>
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 4d888c7..28d2d62 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -29,6 +29,9 @@
>   #include "block/qcow2.h"
>   #include "trace.h"
>   
> +static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
> +                   uint64_t **l2_table);
> +
>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                           bool exact_size)
>   {
> @@ -135,6 +138,185 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>       return ret;
>   }
>   
> +int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
> +                                 int new_l2_index, int64_t boundary_size)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int new_l1_size2, ret, i;

gcc gives me a warning (which is made an error by -Werror) here due to 
uninitialized use of "ret". "ret" is not necessarily set in the while () 
loop and there are some "goto fail"s afterwards which do not set it.

> +    uint64_t *new_l1_table;
> +    int64_t new_l1_table_offset;
> +    int64_t old_l1_table_offset, old_l1_size;
> +    uint8_t data[12];
> +    uint64_t l2_offset;
> +    uint64_t *l2_table, l2_entry;
> +    int64_t l2_free_entry; /* The entry of l2 table need to free from */
> +    uint64_t *old_l1_table = s->l1_table;
> +    int num = s->l1_size - new_l1_size;
> +
> +    assert(new_l1_size <= s->l1_size);
> +    while ((num >= -1) && (s->l1_size + num - 1 >= 0)) {
> +        l2_free_entry = 0;
> +        l2_offset = old_l1_table[s->l1_size + num - 1] & L1E_OFFSET_MASK;
> +
> +        if (l2_offset == 0) {
> +            goto retry;
> +        }
> +
> +        if (num == 0) {
> +            if (new_l2_index == 0) {
> +                goto retry;
> +            }
> +            l2_free_entry = new_l2_index;
> +        }
> +
> +        /* load l2_table into cache */
> +        ret = l2_load(bs, l2_offset, &l2_table);
> +
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        for (i = s->l2_size - 1; i >= 0; i--) {
> +            l2_entry = be64_to_cpu(l2_table[i]);

& L2E_OFFSET_MASK is missing. OFLAG_COPIED will break this without.

> +
> +            /* Due to COW, the clusters in l2 table will
> +             * not in sequential order, so there will be
> +             * some l2_entry >= boundary_size when perform shrinking.
> +             */
> +            if (num == -1) {
> +                if (l2_entry >= boundary_size) {

boundary_size is set to "offset" by the caller. "offset" is the guest 
disk size. l2_entry is (or should be) a host offset. They are not comparable.

> +                    goto free_cluster;
> +                } else {
> +                    continue;
> +                }
> +            }
> +
> +            /* Deal with COW clusters in l2 table when num == 0 */
> +            if (i <= l2_free_entry - 1) {
> +                if (l2_entry >= boundary_size) {
> +                    goto free_cluster;
> +                }
> +                continue;
> +            }
> +
> +            switch (qcow2_get_cluster_type(l2_entry)) {
> +            case QCOW2_CLUSTER_UNALLOCATED:
> +                if (!bs->backing_hd) {
> +                    continue;
> +                }
> +                break;
> +
> +            case QCOW2_CLUSTER_ZERO:
> +                continue;
> +
> +            case QCOW2_CLUSTER_NORMAL:
> +            case QCOW2_CLUSTER_COMPRESSED:
> +                break;
> +
> +            default:
> +                abort();
> +            }
> +
> +        free_cluster:
> +            qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
> +
> +            if (s->qcow_version >= 3) {
> +                l2_table[i] = cpu_to_be64(QCOW_OFLAG_ZERO);
> +            } else {
> +                l2_table[i] = cpu_to_be64(0);
> +            }
> +
> +            /* Then decrease the refcount */
> +            qcow2_free_any_clusters(bs, l2_entry, 1, QCOW2_DISCARD_MAX);
> +        }
> +
> +        ret = qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        if (l2_free_entry == 0 && num != -1) {
> +            qemu_vfree(l2_table);
> +            qcow2_free_clusters(bs, l2_offset, s->cluster_size - 1,
> +                                QCOW2_DISCARD_OTHER);

You should probably flush the L2 cache before you do that.

> +        }
> +    retry:
> +        num--;
> +    }
> +
> +    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
> +    new_l1_table = qemu_try_blockalign(bs->file,
> +                                       align_offset(new_l1_size2, 512));
> +    if (new_l1_table == NULL) {
> +        return -ENOMEM;
> +    }
> +    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
> +
> +    /* shrinking l1 table */
> +    memcpy(new_l1_table, s->l1_table, new_l1_size2);
> +
> +    /* write new table (align to cluster) */
> +    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
> +
> +    if ((new_l1_table_offset) >= boundary_size) {
> +        goto fail;
> +    }
> +
> +    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
> +    if (ret < 0) {
> +        goto fail;
> +    }

You don't need to flush the refblock cache.

> +
> +    /* the L1 position has not yet been updated, so these clusters must
> +     * indeed be completely free */
> +    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
> +                                        new_l1_size2);
> +
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
> +
> +    for (i = 0; i < new_l1_size; i++) {
> +        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
> +    }
> +
> +    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
> +                           new_l1_table, new_l1_size2);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    for (i = 0; i < new_l1_size; i++) {
> +        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
> +    }
> +
> +    /* set new table */
> +    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
> +    cpu_to_be32w((uint32_t *)data, new_l1_size);
> +    stq_be_p(data + 4, new_l1_table_offset);
> +    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
> +                           data, sizeof(data));
> +    if (ret < 0) {
> +        goto fail;
> +    }

I understand your intent with writing a new L1 table. But I don't see 
the need. With 64 kB clusters, a 64 kB L1 table can serve an image with 
4 TB guest size (8192 L2 tables, each having 8192 entries). I don't 
think we'll gain a lot from shrinking the L1 table. It's too much effort.

> +
> +    qemu_vfree(s->l1_table);
> +    old_l1_table_offset = s->l1_table_offset;
> +    s->l1_table_offset = new_l1_table_offset;
> +    s->l1_table = new_l1_table;
> +    old_l1_size = s->l1_size;
> +    s->l1_size = new_l1_size;
> +    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
> +                        QCOW2_DISCARD_OTHER);
> +    return 0;
> + fail:
> +    qemu_vfree(new_l1_table);
> +    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
> +                        QCOW2_DISCARD_OTHER);
> +    return ret;
> +}
> +
>   /*
>    * l2_load
>    *
> diff --git a/block/qcow2.c b/block/qcow2.c
> index d031515..d2b0dfe 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2111,10 +2111,41 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
>           return -ENOTSUP;
>       }
>   
> -    /* shrinking is currently not supported */
> +    /* shrinking image */
>       if (offset < bs->total_sectors * 512) {
> -        error_report("qcow2 doesn't support shrinking images yet");
> -        return -ENOTSUP;
> +        /* As l1 table, l2 table, refcount table, refcount block table
> +         * and file header of the qcow2 image need to use some clusters,
> +         * so should subtract these metadata from offset.
> +         */
> +        int64_t nb_l1 = DIV_ROUND_UP((uint64_t)s->l1_size * sizeof(uint64_t),
> +                                     s->cluster_size);
> +        int64_t nb_l2 = DIV_ROUND_UP(offset, (uint64_t)s->l2_size <<
> +                                     s->cluster_bits);
> +        int64_t nb_refcount_block_table = DIV_ROUND_UP(offset, (uint64_t)
> +                                                       s->cluster_size <<
> +                                                       s->refcount_block_bits);
> +        int64_t nb_refcount_table = DIV_ROUND_UP(nb_refcount_block_table << 3,
> +                                                 s->cluster_size);
> +        int64_t total_nb = 2 * nb_l2 + nb_l1 + nb_refcount_block_table +
> +                           nb_refcount_table + 1;
> +        int64_t offset_for_shrink = offset - (total_nb << s->cluster_bits);

Okay, what does total_nb have to do with offset?

total_nb is some host value. offset is a guest value. Don't mix them.

> +        int new_l2_index = offset_to_l2_index(s, offset_for_shrink);
> +
> +        new_l1_size = size_to_l1(s, offset_for_shrink);
> +        ret = qcow2_shrink_l1_and_l2_table(bs, new_l1_size, new_l2_index,
> +                                           offset);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +
> +        int64_t actual_size = bdrv_get_allocated_file_size(bs);
> +
> +        if (offset < actual_size) {
> +            ret = bdrv_truncate(bs->file, offset);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }

actual_size is not some offset you can use. It's the number of bytes 
allocated in the file. The file can contain holes. Therefore, comparing 
an offset against actual_size is wrong. Second, offset is the guest disk 
size, actual_size is the host disk size. They are not comparable.

Second, why are you truncating the file to offset if offset is smaller 
than actual_size? That is always wrong. You are blindly assuming:
(1) that the image consists of only data and no metadata, because you're 
using the guest disk size (offset) to shrink the host file
(2) that all that data is tightly packed at the beginning of the file

(1) is obviously wrong. (2) is wrong as well, because clusters may be 
spread all over the host file. We would need defragmentation to solve 
that issue.

Therefore, to shorten the host file, you'd need to walk through the 
image and find the highest *host* cluster actually in use. Then you can 
truncate to after that cluster. However, I'd strongly advise against 
that because it doesn't bring any actual benefit.

Today's file systems support holes. Just discard all the clusters which 
get freed during shrinking, that will free up all that space. No need to 
defragment, no need to truncate. Yes, the file will look larger than it 
actually is, but nobody should care. We will need defragmentation to 
tackle that issue.

>       }
>   
>       new_l1_size = size_to_l1(s, offset);
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 577ccd1..be1237d 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -516,6 +516,8 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
>   /* qcow2-cluster.c functions */
>   int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
>                           bool exact_size);
> +int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
> +                                 int new_l2_index, int64_t boundary_size);
>   int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
>   void qcow2_l2_cache_reset(BlockDriverState *bs);
>   int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);

So, as for what I think we do need to do when shrinking (and keep in 
mind: The offset given to qcow2_truncate() is the guest size! NOT the 
host image size!):

(1) Determine the first L2 table and the first entry in the table which 
will lie beyond the new guest disk size.
(2) Discard all clusters beginning from there.
(3) Discard all L2 tables which are then completely empty.
(4) Update the header size.

And that's it. We can either speed up step (2) by implementing it 
manually, or we just use bdrv_discard() on the qcow2 BDS (in the 
simplest case: bdrv_discard(bs, DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE), 
bs->total_sectors - DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE));.

We can incorporate step (3) by extending qcow2_discard_clusters() to 
free L2 tables when they are empty after discard_single_l2(). But we 
don't even have to do that now. It's an optimization we can go about later.

So, we can do (1), (2) and (3) in a single step: Just one bdrv_discard() 
call. But it's probably better to use qcow2_discard_clusters() instead 
and set the full_discard parameter to true.

So: qcow2_discard_clusters(bs, offset, bs->total_sectors - offset / 
BDRV_SECTOR_SIZE, true);. Then update the guest disk size field in the 
header. And we're done.

There are four problems with this approach:
- qcow2_discard_clusters() might be slower than optimal. I personally 
don't care at all.
- If "bs->total_sectors * BDRV_SECTOR_SIZE - offset" is greater than 
INT_MAX, this won't work. Trivially solvable by encapsulating the 
qcow2_discard_clusters() call in a loop which limits nb_clusters to 
INT_MAX / BDRV_SECTOR_SIZE.
- The L1 table is not resized. Should not matter in practice at all.
- The file is not truncated. Does not matter either (because holes in 
the file are good enough), and we can't actually solve this problem 
without defragmentation anyway.

There is one advantage:
- It's extremely simple. It's literally below ten lines of code.

I think the advantage far outweighs the disadvantage. But I may be 
wrong. What do you think?

Max
Eric Blake Nov. 24, 2014, 5:49 p.m. UTC | #2
On 11/21/2014 03:56 AM, Max Reitz wrote:
> 
> Second, why are you truncating the file to offset if offset is smaller
> than actual_size? That is always wrong. You are blindly assuming:
> (1) that the image consists of only data and no metadata, because you're
> using the guest disk size (offset) to shrink the host file
> (2) that all that data is tightly packed at the beginning of the file
> 
> (1) is obviously wrong. (2) is wrong as well, because clusters may be
> spread all over the host file. We would need defragmentation to solve
> that issue.
> 
> Therefore, to shorten the host file, you'd need to walk through the
> image and find the highest *host* cluster actually in use. Then you can
> truncate to after that cluster. However, I'd strongly advise against
> that because it doesn't bring any actual benefit.

Well, there is a minor benefit - the 'query-blockstats' QMP command
reports wr_highest_offset, which IS the offset of the highest host
cluster already in use, but right now, we only populate that field on
the first allocating write, whereas I would like to be able to get at
that information even for a read-only file.  See also Max's thread for
optimizing overlap checks, where I mention this same thing as a
side-effect we would get for free when improving overlap checks.

> So, as for what I think we do need to do when shrinking (and keep in
> mind: The offset given to qcow2_truncate() is the guest size! NOT the
> host image size!):
> 
> (1) Determine the first L2 table and the first entry in the table which
> will lie beyond the new guest disk size.
> (2) Discard all clusters beginning from there.
> (3) Discard all L2 tables which are then completely empty.

You may want to also consider discarding refblocks when completely
empty, and truncating the size of the reftable if enough trailing
refblocks are dropped.  On the other hand, just as Max argued that
shrinking the L1 table is going to be in the noise, you are also going
to be in the noise for trying to shrink the reftable.

> (4) Update the header size.
> 
> And that's it. We can either speed up step (2) by implementing it
> manually, or we just use bdrv_discard() on the qcow2 BDS (in the
> simplest case: bdrv_discard(bs, DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE),
> bs->total_sectors - DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE));.
> 
> We can incorporate step (3) by extending qcow2_discard_clusters() to
> free L2 tables when they are empty after discard_single_l2(). But we
> don't even have to that now. It's an optimization we can go about later.

Same for trimming unused refblocks and/or shrinking the reftable.  Also,
I think that a future addition of a defragmentation pass would be a more
ideal place for tightly packing an image, including trimming reftables
to a bare minimum.

> 
> There is one advantage:
> - It's extremely simple. It's literally below ten lines of code.
> 
> I think the advantage far outweighs the disadvantage. But I may be
> wrong. What do you think?

I also like the much simpler approach of leaving holes.
Jun Li Jan. 3, 2015, 12:23 p.m. UTC | #3
On Fri, 11/21 11:56, Max Reitz wrote:
> On 2014-10-26 at 16:20, Jun Li wrote:
> >This patch is the realization of new function qcow2_shrink_l1_and_l2_table.
> >This function will shrink/discard l1 and l2 table when do qcow2 shrinking.
> >
> >Signed-off-by: Jun Li <junmuzi@gmail.com>
> >---
> >v5:
> >   Do some modifications based on MAX's suggestion. Thanks for MAX.
> >   In v5, do l2 shrinking firstly, then do l1 shrinking in function qcow2_shrink_l1_and_l2_table. As do l1 shrinking need to allocate some clusters for new l1 table, so in v5 it can re-use the freed clusters come from l2 shrinking.
> >
> >v4:
> >   Add deal with COW clusters in l2 table. When using COW, some of (l2_entry >>
> >s->cluster_bits) will larger than s->refcount_table_size, so need to discard
> >this l2_entry.
> >
> >v3:
> >   Fixed host cluster leak.
> >---
> >  block/qcow2-cluster.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  block/qcow2.c         |  37 +++++++++-
> >  block/qcow2.h         |   2 +
> >  3 files changed, 218 insertions(+), 3 deletions(-)
> >
> >diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> >index 4d888c7..28d2d62 100644
> >--- a/block/qcow2-cluster.c
> >+++ b/block/qcow2-cluster.c
> >@@ -29,6 +29,9 @@
> >  #include "block/qcow2.h"
> >  #include "trace.h"
> >+static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
> >+                   uint64_t **l2_table);
> >+
> >  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> >                          bool exact_size)
> >  {
> >@@ -135,6 +138,185 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> >      return ret;
> >  }
> >+int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
> >+                                 int new_l2_index, int64_t boundary_size)
> >+{
> >+    BDRVQcowState *s = bs->opaque;
> >+    int new_l1_size2, ret, i;
> 
> gcc gives me a warning (which is made an error by -Werror) here due to
> uninitialized use of "ret". "ret" is not necessarily set in the while ()
> loop and there are some "goto fail"s afterwards which do not set it.
> 
> >+    uint64_t *new_l1_table;
> >+    int64_t new_l1_table_offset;
> >+    int64_t old_l1_table_offset, old_l1_size;
> >+    uint8_t data[12];
> >+    uint64_t l2_offset;
> >+    uint64_t *l2_table, l2_entry;
> >+    int64_t l2_free_entry; /* The entry of l2 table need to free from */
> >+    uint64_t *old_l1_table = s->l1_table;
> >+    int num = s->l1_size - new_l1_size;
> >+
> >+    assert(new_l1_size <= s->l1_size);
> >+    while ((num >= -1) && (s->l1_size + num - 1 >= 0)) {
> >+        l2_free_entry = 0;
> >+        l2_offset = old_l1_table[s->l1_size + num - 1] & L1E_OFFSET_MASK;
> >+
> >+        if (l2_offset == 0) {
> >+            goto retry;
> >+        }
> >+
> >+        if (num == 0) {
> >+            if (new_l2_index == 0) {
> >+                goto retry;
> >+            }
> >+            l2_free_entry = new_l2_index;
> >+        }
> >+
> >+        /* load l2_table into cache */
> >+        ret = l2_load(bs, l2_offset, &l2_table);
> >+
> >+        if (ret < 0) {
> >+            return ret;
> >+        }
> >+
> >+        for (i = s->l2_size - 1; i >= 0; i--) {
> >+            l2_entry = be64_to_cpu(l2_table[i]);
> 
> & L2E_OFFSET_MASK is missing. OFLAG_COPIED will break this without.
> 
> >+
> >+            /* Due to COW, the clusters in l2 table will
> >+             * not in sequential order, so there will be
> >+             * some l2_entry >= boundary_size when perform shrinking.
> >+             */
> >+            if (num == -1) {
> >+                if (l2_entry >= boundary_size) {
> 
> boundary_size is set to "offset" by the caller. "offset" is the guest disk
> size. l2_entry (or should be) a host offset. They are not comparable.
> 
> >+                    goto free_cluster;
> >+                } else {
> >+                    continue;
> >+                }
> >+            }
> >+
> >+            /* Deal with COW clusters in l2 table when num == 0 */
> >+            if (i <= l2_free_entry - 1) {
> >+                if (l2_entry >= boundary_size) {
> >+                    goto free_cluster;
> >+                }
> >+                continue;
> >+            }
> >+
> >+            switch (qcow2_get_cluster_type(l2_entry)) {
> >+            case QCOW2_CLUSTER_UNALLOCATED:
> >+                if (!bs->backing_hd) {
> >+                    continue;
> >+                }
> >+                break;
> >+
> >+            case QCOW2_CLUSTER_ZERO:
> >+                continue;
> >+
> >+            case QCOW2_CLUSTER_NORMAL:
> >+            case QCOW2_CLUSTER_COMPRESSED:
> >+                break;
> >+
> >+            default:
> >+                abort();
> >+            }
> >+
> >+        free_cluster:
> >+            qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
> >+
> >+            if (s->qcow_version >= 3) {
> >+                l2_table[i] = cpu_to_be64(QCOW_OFLAG_ZERO);
> >+            } else {
> >+                l2_table[i] = cpu_to_be64(0);
> >+            }
> >+
> >+            /* Then decrease the refcount */
> >+            qcow2_free_any_clusters(bs, l2_entry, 1, QCOW2_DISCARD_MAX);
> >+        }
> >+
> >+        ret = qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
> >+        if (ret < 0) {
> >+            return ret;
> >+        }
> >+        if (l2_free_entry == 0 && num != -1) {
> >+            qemu_vfree(l2_table);
> >+            qcow2_free_clusters(bs, l2_offset, s->cluster_size - 1,
> >+                                QCOW2_DISCARD_OTHER);
> 
> You should probably flush the L2 cache before you do that.
> 
> >+        }
> >+    retry:
> >+        num--;
> >+    }
> >+
> >+    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
> >+    new_l1_table = qemu_try_blockalign(bs->file,
> >+                                       align_offset(new_l1_size2, 512));
> >+    if (new_l1_table == NULL) {
> >+        return -ENOMEM;
> >+    }
> >+    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
> >+
> >+    /* shrinking l1 table */
> >+    memcpy(new_l1_table, s->l1_table, new_l1_size2);
> >+
> >+    /* write new table (align to cluster) */
> >+    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
> >+
> >+    if ((new_l1_table_offset) >= boundary_size) {
> >+        goto fail;
> >+    }
> >+
> >+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
> >+    if (ret < 0) {
> >+        goto fail;
> >+    }
> 
> You don't need to flush the refblock cache.
> 
> >+
> >+    /* the L1 position has not yet been updated, so these clusters must
> >+     * indeed be completely free */
> >+    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
> >+                                        new_l1_size2);
> >+
> >+    if (ret < 0) {
> >+        goto fail;
> >+    }
> >+
> >+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
> >+
> >+    for (i = 0; i < new_l1_size; i++) {
> >+        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
> >+    }
> >+
> >+    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
> >+                           new_l1_table, new_l1_size2);
> >+    if (ret < 0) {
> >+        goto fail;
> >+    }
> >+
> >+    for (i = 0; i < new_l1_size; i++) {
> >+        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
> >+    }
> >+
> >+    /* set new table */
> >+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
> >+    cpu_to_be32w((uint32_t *)data, new_l1_size);
> >+    stq_be_p(data + 4, new_l1_table_offset);
> >+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
> >+                           data, sizeof(data));
> >+    if (ret < 0) {
> >+        goto fail;
> >+    }
> 
> I understand your intent with writing a new L1 table. But I don't see the
> need. With 64 kB clusters, a 64 kB L1 table can serve an image with 4 TB
> guest size (8192 L2 tables, each having 8192 entries). I don't think we'll
> gain a lot from shrinking the L1 table. It's too much effort.
> 
> >+
> >+    qemu_vfree(s->l1_table);
> >+    old_l1_table_offset = s->l1_table_offset;
> >+    s->l1_table_offset = new_l1_table_offset;
> >+    s->l1_table = new_l1_table;
> >+    old_l1_size = s->l1_size;
> >+    s->l1_size = new_l1_size;
> >+    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
> >+                        QCOW2_DISCARD_OTHER);
> >+    return 0;
> >+ fail:
> >+    qemu_vfree(new_l1_table);
> >+    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
> >+                        QCOW2_DISCARD_OTHER);
> >+    return ret;
> >+}
> >+
> >  /*
> >   * l2_load
> >   *
> >diff --git a/block/qcow2.c b/block/qcow2.c
> >index d031515..d2b0dfe 100644
> >--- a/block/qcow2.c
> >+++ b/block/qcow2.c
> >@@ -2111,10 +2111,41 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
> >          return -ENOTSUP;
> >      }
> >-    /* shrinking is currently not supported */
> >+    /* shrinking image */
> >      if (offset < bs->total_sectors * 512) {
> >-        error_report("qcow2 doesn't support shrinking images yet");
> >-        return -ENOTSUP;
> >+        /* As l1 table, l2 table, refcount table, refcount block table
> >+         * and file header of the qcow2 image need to use some clusters,
> >+         * so should subtract these metadata from offset.
> >+         */
> >+        int64_t nb_l1 = DIV_ROUND_UP((uint64_t)s->l1_size * sizeof(uint64_t),
> >+                                     s->cluster_size);
> >+        int64_t nb_l2 = DIV_ROUND_UP(offset, (uint64_t)s->l2_size <<
> >+                                     s->cluster_bits);
> >+        int64_t nb_refcount_block_table = DIV_ROUND_UP(offset, (uint64_t)
> >+                                                       s->cluster_size <<
> >+                                                       s->refcount_block_bits);
> >+        int64_t nb_refcount_table = DIV_ROUND_UP(nb_refcount_block_table << 3,
> >+                                                 s->cluster_size);
> >+        int64_t total_nb = 2 * nb_l2 + nb_l1 + nb_refcount_block_table +
> >+                           nb_refcount_table + 1;
> >+        int64_t offset_for_shrink = offset - (total_nb << s->cluster_bits);
> 
> Okay, what does total_nb have to do with offset?
> 
> total_nb is some host value. offset is a guest value. Don't mix them.
> 
> >+        int new_l2_index = offset_to_l2_index(s, offset_for_shrink);
> >+
> >+        new_l1_size = size_to_l1(s, offset_for_shrink);
> >+        ret = qcow2_shrink_l1_and_l2_table(bs, new_l1_size, new_l2_index,
> >+                                           offset);
> >+        if (ret < 0) {
> >+            return ret;
> >+        }
> >+
> >+        int64_t actual_size = bdrv_get_allocated_file_size(bs);
> >+
> >+        if (offset < actual_size) {
> >+            ret = bdrv_truncate(bs->file, offset);
> >+            if (ret < 0) {
> >+                return ret;
> >+            }
> >+        }
> 
> actual_size is not some offset you can use. It's the number of bytes
> allocated in the file. The file can contain holes. Therefore, comparing an
> offset against actual_size is wrong. Second, offset is the guest disk size,
> actual_size is the host disk size. They are not comparable.
> 
> Second, why are you truncating the file to offset if offset is smaller than
> actual_size? That is always wrong. You are blindly assuming:
> (1) that the image consists of only data and no metadata, because you're
> using the guest disk size (offset) to shrink the host file
> (2) that all that data is tightly packed at the beginning of the file

Yes, the issues above do exist, thank you.

In my original thought, if we do not truncate the file, we will hit "virtual size <
disk size", which looks uncomfortable. Based on this, I do bdrv_truncate
after qcow2_shrink_l1_and_l2_table. What's your opinion?

> (1) is obviously wrong. (2) is wrong as well, because clusters may be spread
> all over the host file. We would need defragmentation to solve that issue.
> 
> Therefore, to shorten the host file, you'd need to walk through the image
> and find the highest *host* cluster actually in use. Then you can truncate
> to after that cluster. However, I'd strongly advise against that because it
> doesn't bring any actual benefit.
> 
> Today's file systems support holes. Just discard all the clusters which get
> freed during shrinking, that will free up all that space. No need to
> defragment, no need to truncate. Yes, the file will look larger than it
> actually is, but nobody should care. We will need defragmentation to tackle
> that issue.
> 
> >      }
> >      new_l1_size = size_to_l1(s, offset);
> >diff --git a/block/qcow2.h b/block/qcow2.h
> >index 577ccd1..be1237d 100644
> >--- a/block/qcow2.h
> >+++ b/block/qcow2.h
> >@@ -516,6 +516,8 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
> >  /* qcow2-cluster.c functions */
> >  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
> >                          bool exact_size);
> >+int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
> >+                                 int new_l2_index, int64_t boundary_size);
> >  int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
> >  void qcow2_l2_cache_reset(BlockDriverState *bs);
> >  int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
> 
> So, as for what I think we do need to do when shrinking (and keep in mind:
> The offset given to qcow2_truncate() is the guest size! NOT the host image
> size!):
> 
> (1) Determine the first L2 table and the first entry in the table which will
> lie beyond the new guest disk size.

This is not always correct. Due to COW, using the offset to calculate the
first entry of the first L2 table will be incorrect.

What I have done for this scenario:
(1) if the first entry is the first entry of the L2 table, so will scan "the
previous L2 table"("the previous L2 table" location is in front of "L2 table"
in L1 table). If the entry of previous L2 table is larger than offset, will
discard this entry, too.
(2) If the first entry is not the first entry of the L2 table, still to scan
the whole L2 table to make sure no entry is beyond offset.

> (2) Discard all clusters beginning from there.
> (3) Discard all L2 tables which are then completely empty.
> (4) Update the header size.

I think the current realization of this patch already includes the above 4
steps. The current patch also has another step 5.
(5) truncate the file.

Here I think we should also discard the refcount table and refcount block
table when they are completely empty.

> 
> And that's it. We can either speed up step (2) by implementing it manually,
> or we just use bdrv_discard() on the qcow2 BDS (in the simplest case:
> bdrv_discard(bs, DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE), bs->total_sectors -
> DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE));.
> 
> We can incorporate step (3) by extending qcow2_discard_clusters() to free L2
> tables when they are empty after discard_single_l2(). But we don't even have
> to that now. It's an optimization we can go about later.
> 
> So, we can do (1), (2) and (3) in a single step: Just one bdrv_discard()
> call. But it's probably better to use qcow2_discard_clusters() instead and
> set the full_discard parameter to true.
> 
> So: qcow2_discard_clusters(bs, offset, bs->total_sectors - offset /
> BDRV_SECTOR_SIZE, true);. Then update the guest disk size field in the
> header. And we're done.
> 
> There are four problems with this approach:
> - qcow2_discard_clusters() might be slower than optimal. I personally don't
> care at all.
> - If "bs->total_sectors * BDRV_SECTOR_SIZE - offset" is greater than
> INT_MAX, this won't work. Trivially solvable by encapsulating the
> qcow2_discard_clusters() call in a loop which limits nb_clusters to INT_MAX
> / BDRV_SECTOR_SIZE.
> - The L1 table is not resized. Should not matter in practice at all.

Yes, agree with you.

> - The file is not truncated. Does not matter either (because holes in the
> file are good enough), and we can't actually solve this problem without
> defragmentation anyway.
> 
> There is one advantage:
> - It's extremely simple. It's literally below ten lines of code.
> 
> I think the advantage far outweighs the disadvantage. But I may be wrong.
> What do you think?

Hi max,

  Sorry for replying so late, as I have been very busy recently. I think it is
better to reach an agreement on how to realize qcow2 shrinking first, and then
write the code.
Another issue, as gmail can not be used in current China, I have to use this
email to reply. :)

Regards,
Jun Li
Max Reitz Jan. 15, 2015, 6:47 p.m. UTC | #4
On 2015-01-03 at 07:23, Jun Li wrote:
> On Fri, 11/21 11:56, Max Reitz wrote:
>> So, as for what I think we do need to do when shrinking (and keep in mind:
>> The offset given to qcow2_truncate() is the guest size! NOT the host image
>> size!):
>>
>> (1) Determine the first L2 table and the first entry in the table which will
>> lie beyond the new guest disk size.
> Here is not correct always. Due to the COW, using offset to calculate the
> first entry of the first L2 table will be incorrect.

Again: This is *not* about the host disk size or the host offset of some 
cluster, but about the *guest* disk size.

Let's make up an example. You have a 2 GB disk but you want to resize it 
to 1.25 GB. The cluster size is 64 kB, therefore we have 2 GB / 64 kB = 
32,768 data clusters (as long as there aren't any internal snapshots, 
which is a prerequisite for resizing qcow2 images).

Every L2 table contains 65,536 / 8 = 8,192 entries; there are thus 
32,768 / 8,192 = 4 L2 tables.

As you can see, one can directly derive the number of data clusters and 
L2 tables from the guest disk size (as long as there aren't any internal 
snapshots).

So of course we can do the same for the target disk size: 1.25 GB / 64 
kB = 20,480 data clusters; 20,480 / 8,192 = 2.5 L2 tables, therefore we 
need three L2 tables but only half of the last one (4,096 entries).

We know that every cluster references somewhere after that limit (that 
is, every entry in the fourth L2 table and every entry starting with 
index 4,096 in the third L2 table) is a data cluster with a guest offset 
somewhere beyond 1.25 GB, so we don't need it anymore.

Thus, we simply discard all those data clusters and after that we can 
discard the fourth L2 table. That's it.

If we really want to we can calculate the highest cluster host offset in 
use and truncate the image accordingly. But that's optional, see the 
last point in my "problems with this approach" list (having discarded 
the clusters should save us all the space already). Furthermore, as I'm 
saying in that list, to really solve this issue, we'd need qcow2 
defragmentation.

> What I have done for this scenario:
> (1) if the first entry is the first entry of the L2 table, so will scan "the
> previous L2 table"("the previous L2 table" location is in front of "L2 table"
> in L1 table). If the entry of previous L2 table is larger than offset, will
> discard this entry, too.
> (2) If the first entry is not the first entry of the L2 table, still to scan
> the whole L2 table to make sure no entry is beyond offset.
>
>> (2) Discard all clusters beginning from there.
>> (3) Discard all L2 tables which are then completely empty.
>> (4) Update the header size.
> For this patch current's realizion, have include above 4 steps I think.
> Current patch, also have another step 5.
> (5) truncate the file.

As I wrote above, you can do that but it shouldn't matter much because 
the discarded clusters should not use any disk space.

> Here I think we also should add discard refcount table and refcount block
> table when they are completely empty.
>
>> And that's it. We can either speed up step (2) by implementing it manually,
>> or we just use bdrv_discard() on the qcow2 BDS (in the simplest case:
>> bdrv_discard(bs, DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE), bs->total_sectors -
>> DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE));.
>>
>> We can incorporate step (3) by extending qcow2_discard_clusters() to free L2
>> tables when they are empty after discard_single_l2(). But we don't even have
>> to that now. It's an optimization we can go about later.
>>
>> So, we can do (1), (2) and (3) in a single step: Just one bdrv_discard()
>> call. But it's probably better to use qcow2_discard_clusters() instead and
>> set the full_discard parameter to true.
>>
>> So: qcow2_discard_clusters(bs, offset, bs->total_sectors - offset /
>> BDRV_SECTOR_SIZE, true);. Then update the guest disk size field in the
>> header. And we're done.
>>
>> There are four problems with this approach:
>> - qcow2_discard_clusters() might be slower than optimal. I personally don't
>> care at all.
>> - If "bs->total_sectors * BDRV_SECTOR_SIZE - offset" is greater than
>> INT_MAX, this won't work. Trivially solvable by encapsulating the
>> qcow2_discard_clusters() call in a loop which limits nb_clusters to INT_MAX
>> / BDRV_SECTOR_SIZE.
>> - The L1 table is not resized. Should not matter in practice at all.
> Yes, agree with you.
>
>> - The file is not truncated. Does not matter either (because holes in the
>> file are good enough), and we can't actually solve this problem without
>> defragmentation anyway.
>>
>> There is one advantage:
>> - It's extremely simple. It's literally below ten lines of code.
>>
>> I think the advantage far outweighs the disadvantage. But I may be wrong.
>> What do you think?
> Hi max,
>
>    Sorry for so late to reply as I am so busy recently. I think let's have an
> agreement on how to realize qcow2 shrinking first, then type code is better.

Yes, this will probably be for the best. :-)

> Another issue, as gmail can not be used in current China, I have to use this
> email to reply. :)

No problem.

Max
Jun Li Jan. 19, 2015, 1:16 p.m. UTC | #5
On Thu, 01/15 13:47, Max Reitz wrote:
> On 2015-01-03 at 07:23, Jun Li wrote:
> >On Fri, 11/21 11:56, Max Reitz wrote:
> >>So, as for what I think we do need to do when shrinking (and keep in mind:
> >>The offset given to qcow2_truncate() is the guest size! NOT the host image
> >>size!):
> >>
> >>(1) Determine the first L2 table and the first entry in the table which will
> >>lie beyond the new guest disk size.
> >Here is not correct always. Due to the COW, using offset to calculate the
> >first entry of the first L2 table will be incorrect.
> 
> Again: This is *not* about the host disk size or the host offset of some
> cluster, but about the *guest* disk size.
> 
> Let's make up an example. You have a 2 GB disk but you want to resize it to
> 1.25 GB. The cluster size is 64 kB, therefore we have 2 GB / 64 kB = 32,768
> data clusters (as long as there aren't any internal snapshots, which is a
> prerequisite for resizing qcow2 images).
> 
> Every L2 table contains 65,536 / 8 = 8,192 entries; there are thus 32,768 /
> 8,192 = 4 L2 tables.
> 
> As you can see, one can directly derive the number of data clusters and L2
> tables from the guest disk size (as long as there aren't any internal
> snapshots).
> 
> So of course we can do the same for the target disk size: 1.25 GB / 64 kB =
> 20,480 data clusters; 20,480 / 8,192 = 2.5 L2 tables, therefore we need
> three L2 tables but only half of the last one (4,096 entries).
> 

Sorry, last time was my misunderstanding. If we do not use qcow2_truncate(), I
think the above issue does not exist.

In my original thought, I wanted to say:
Sometimes the second L2 table will contain some entry whose pointer points to
a cluster whose address is larger than 1.25 GB.

So if we do not use qcow2_truncate(), the above cluster whose address is
larger than 1.25 GB won't be discarded.

But I still have another worry.

Suppose "virtual size" and "disk size" are all 2G. After we resize it to
1.25G, seems we will get "virtual size" is 1.25G but "disk size" is still 2G
if do not use "qcow2_truncate()" to truncate the file(Yes, I know use
qcow2_truncate is not a resolution). This seems strange, not so perfect.

> We know that every cluster references somewhere after that limit (that is,
> every entry in the fourth L2 table and every entry starting with index 4,096
> in the third L2 table) is a data cluster with a guest offset somewhere
> beyond 1.25 GB, so we don't need it anymore.
> 
> Thus, we simply discard all those data clusters and after that we can
> discard the fourth L2 table. That's it.
> 
> If we really want to we can calculate the highest cluster host offset in use
> and truncate the image accordingly. But that's optional, see the last point
> in my "problems with this approach" list (having discarded the clusters
> should save us all the space already). Furthermore, as I'm saying in that
> list, to really solve this issue, we'd need qcow2 defragmentation.
> 

Do we already have "qcow2 defragmentation" realization?

Jun Li

> >What I have done for this scenario:
> >(1) if the first entry is the first entry of the L2 table, so will scan "the
> >previous L2 table"("the previous L2 table" location is in front of "L2 table"
> >in L1 table). If the entry of previous L2 table is larger than offset, will
> >discard this entry, too.
> >(2) If the first entry is not the first entry of the L2 table, still to scan
> >the whole L2 table to make sure no entry is beyond offset.
> >
> >>(2) Discard all clusters beginning from there.
> >>(3) Discard all L2 tables which are then completely empty.
> >>(4) Update the header size.
> >For this patch current's realizion, have include above 4 steps I think.
> >Current patch, also have another step 5.
> >(5) truncate the file.
> 
> As I wrote above, you can do that but it shouldn't matter much because the
> discarded clusters should not use any disk space.
> 
> >Here I think we also should add discard refcount table and refcount block
> >table when they are completely empty.
> >
> >>And that's it. We can either speed up step (2) by implementing it manually,
> >>or we just use bdrv_discard() on the qcow2 BDS (in the simplest case:
> >>bdrv_discard(bs, DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE), bs->total_sectors -
> >>DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE));.
> >>
> >>We can incorporate step (3) by extending qcow2_discard_clusters() to free L2
> >>tables when they are empty after discard_single_l2(). But we don't even have
> >>to that now. It's an optimization we can go about later.
> >>
> >>So, we can do (1), (2) and (3) in a single step: Just one bdrv_discard()
> >>call. But it's probably better to use qcow2_discard_clusters() instead and
> >>set the full_discard parameter to true.
> >>
> >>So: qcow2_discard_clusters(bs, offset, bs->total_sectors - offset /
> >>BDRV_SECTOR_SIZE, true);. Then update the guest disk size field in the
> >>header. And we're done.
> >>
> >>There are four problems with this approach:
> >>- qcow2_discard_clusters() might be slower than optimal. I personally don't
> >>care at all.
> >>- If "bs->total_sectors * BDRV_SECTOR_SIZE - offset" is greater than
> >>INT_MAX, this won't work. Trivially solvable by encapsulating the
> >>qcow2_discard_clusters() call in a loop which limits nb_clusters to INT_MAX
> >>/ BDRV_SECTOR_SIZE.
> >>- The L1 table is not resized. Should not matter in practice at all.
> >Yes, agree with you.
> >
> >>- The file is not truncated. Does not matter either (because holes in the
> >>file are good enough), and we can't actually solve this problem without
> >>defragmentation anyway.
> >>
> >>There is one advantage:
> >>- It's extremely simple. It's literally below ten lines of code.
> >>
> >>I think the advantage far outweighs the disadvantage. But I may be wrong.
> >>What do you think?
> >Hi max,
> >
> >   Sorry for so late to reply as I am so busy recently. I think let's have an
> >agreement on how to realize qcow2 shrinking first, then type code is better.
> 
> Yes, this will probably be for the best. :-)
> 
> >Another issue, as gmail can not be used in current China, I have to use this
> >email to reply. :)
> 
> No problem.
> 
> Max
Max Reitz Jan. 22, 2015, 7:14 p.m. UTC | #6
On 2015-01-19 at 08:16, Jun Li wrote:
> On Thu, 01/15 13:47, Max Reitz wrote:
>> On 2015-01-03 at 07:23, Jun Li wrote:
>>> On Fri, 11/21 11:56, Max Reitz wrote:
>>>> So, as for what I think we do need to do when shrinking (and keep in mind:
>>>> The offset given to qcow2_truncate() is the guest size! NOT the host image
>>>> size!):
>>>>
>>>> (1) Determine the first L2 table and the first entry in the table which will
>>>> lie beyond the new guest disk size.
>>> Here is not correct always. Due to the COW, using offset to calculate the
>>> first entry of the first L2 table will be incorrect.
>> Again: This is *not* about the host disk size or the host offset of some
>> cluster, but about the *guest* disk size.
>>
>> Let's make up an example. You have a 2 GB disk but you want to resize it to
>> 1.25 GB. The cluster size is 64 kB, therefore we have 2 GB / 64 kB = 32,768
>> data clusters (as long as there aren't any internal snapshots, which is a
>> prerequisite for resizing qcow2 images).
>>
>> Every L2 table contains 65,536 / 8 = 8,192 entries; there are thus 32,768 /
>> 8,192 = 4 L2 tables.
>>
>> As you can see, one can directly derive the number of data clusters and L2
>> tables from the guest disk size (as long as there aren't any internal
>> snapshots).
>>
>> So of course we can do the same for the target disk size: 1.25 GB / 64 kB =
>> 20,480 data clusters; 20,480 / 8,192 = 2.5 L2 tables, therefore we need
>> three L2 tables but only half of the last one (4,096 entries).
>>
> Sorry, last time is my mis-understanding. If do not use qcow2_truncate(), I
> think don't existing above issue.
>
> For my original thought, I want to say:
> Sometimes the second L2 table will contain some entry, the pointer in this
> entry will point to a cluster which address is larger than 1.25 GB.

Correct.

> So if not use qcow2_truncate(), won't discard above cluster which address is
> larger than 1.25 GB.

I'm sorry, I can't really follow what you are trying to say here, so 
I'll just try to reply with things that may or may not be what you 
wanted to talk about.

If you are using qemu-img resize and thus subsequently qcow2_truncate() 
to shrink an image, you cannot expect the image to shrink to the 
specified file length, for several reasons.

First, if you shrink it to 1 GB, but only half of that is actually used, 
the image might of course very well have a length below 1 GB.

Second, there is metadata overhead. So if you are changing the guest 
disk size to 1 GB (all of which is occupied), the host file size will 
exceed 1 GB because of that overhead.

Third, I keep repeating myself here, but file length is not file size. 
So you may observe a file length of 10 GB or more because the clusters 
are spread all over the image file. This is something we'd have to 
combat with defragmentation; but the question is whether we really need 
to (see below for more on that). The point is that it doesn't matter 
whether the image has a file length of 10 GB; the file size will be 
around 1 GB anyway.

> But I still have another worry.
>
> Suppose "virtual size" and "disk size" are all 2G. After we resize it to
> 1.25G, seems we will get "virtual size" is 1.25G but "disk size" is still 2G

No, it won't. I can prove it to you:

$ qemu-img create -f qcow2 test.qcow2 64M
$ qemu-io -c 'write 0 64M' test.qcow2
$ qemu-img info test.qcow2
...
disk size: 64M
...

Okay, so far it's just what we'd expect. Now let's implement my proposal 
for truncation: Let's assume the image should be shrinked to 32 MB, so 
we discard all clusters starting at 32 MB (guest offset) (which is 64 MB 
- 32 MB = 32 MB of data):

$ qemu-io -c 'discard 32M 32M' test.qcow2
$ qemu-img info test.qcow2
...
disk size: 32M
...

Great!

> if do not use "qcow2_truncate()" to truncate the file(Yes, I know use
> qcow2_truncate is not a resolution). This seems strange, not so perfect.
>
>> We know that every cluster references somewhere after that limit (that is,
>> every entry in the fourth L2 table and every entry starting with index 4,096
>> in the third L2 table) is a data cluster with a guest offset somewhere
>> beyond 1.25 GB, so we don't need it anymore.
>>
>> Thus, we simply discard all those data clusters and after that we can
>> discard the fourth L2 table. That's it.
>>
>> If we really want to we can calculate the highest cluster host offset in use
>> and truncate the image accordingly. But that's optional, see the last point
>> in my "problems with this approach" list (having discarded the clusters
>> should save us all the space already). Furthermore, as I'm saying in that
>> list, to really solve this issue, we'd need qcow2 defragmentation.
>>
> Do we already have "qcow2 defragmentation" realization?

No, we don't. The only way to defragment a qcow2 image right now is 
using qemu-img convert to create a (defragmented) copy and then delete 
the old image, which has the disadvantage of temporarily requiring 
double the disk space and being an offline operation.

So far, nobody has implemented online defragmentation, mainly for two 
reasons: It would probably be pretty complicated (it'd probably need to 
be a block job which links into a pretty low-level function provided by 
qcow2 (defragment_some_clusters or something)) and second, so far there 
has been little demand. Disk space is not an issue (as said before), 
because it doesn't really matter to a modern file system whether your 
file has a length of 100 MB or 100 GB; that's just some number. What 
really matters is how much of that space is actually used; and if all 
unused clusters are discarded, there won't be any space used for them 
(well, maybe there is some metadata overhead, but that should be 
negligible).

There are a couple of reasons why you'd want to defragment an image:

First, it makes you feel better. I can relate to that, but it's not a 
real reason.

Second, it may improve performance: The guest may expect consecutive 
reads to be fast; but if the clusters are sprinkled all over the host, 
consecutive guest reads no longer necessarily translate to consecutive 
reads on the host (same for writes, of course). Defragmentation would 
probably fix that, but if you want to rely on this, you'd better use 
preallocated image files.

Third, it looks better. People expect the file length to be a raw 
indicator of the file size. However, for me this is related to "it makes 
you feel better", because this also is not a really good reason.

Fourth, using a non-modern file system may let your file size explode 
because suddenly, file length is actually equal to the file size. But I 
think, in this case you should just use a better file system.

I don't know whether "cp" copies holes in files; its manpage says it 
does create sparse images, but I don't know how well it works; but I 
just assume it works well enough.

Max

> Jun Li
Jun Li Jan. 27, 2015, 2:06 p.m. UTC | #7
On Thu, 01/22 14:14, Max Reitz wrote:
> On 2015-01-19 at 08:16, Jun Li wrote:
> >On Thu, 01/15 13:47, Max Reitz wrote:
> >>On 2015-01-03 at 07:23, Jun Li wrote:
> >>>On Fri, 11/21 11:56, Max Reitz wrote:
> >>>>So, as for what I think we do need to do when shrinking (and keep in mind:
> >>>>The offset given to qcow2_truncate() is the guest size! NOT the host image
> >>>>size!):
> >>>>
> >>>>(1) Determine the first L2 table and the first entry in the table which will
> >>>>lie beyond the new guest disk size.
> >>>Here is not correct always. Due to the COW, using offset to calculate the
> >>>first entry of the first L2 table will be incorrect.
> >>Again: This is *not* about the host disk size or the host offset of some
> >>cluster, but about the *guest* disk size.
> >>
> >>Let's make up an example. You have a 2 GB disk but you want to resize it to
> >>1.25 GB. The cluster size is 64 kB, therefore we have 2 GB / 64 kB = 32,768
> >>data clusters (as long as there aren't any internal snapshots, which is a
> >>prerequisite for resizing qcow2 images).
> >>
> >>Every L2 table contains 65,536 / 8 = 8,192 entries; there are thus 32,768 /
> >>8,192 = 4 L2 tables.
> >>
> >>As you can see, one can directly derive the number of data clusters and L2
> >>tables from the guest disk size (as long as there aren't any internal
> >>snapshots).
> >>
> >>So of course we can do the same for the target disk size: 1.25 GB / 64 kB =
> >>20,480 data clusters; 20,480 / 8,192 = 2.5 L2 tables, therefore we need
> >>three L2 tables but only half of the last one (4,096 entries).
> >>
> >Sorry, last time is my mis-understanding. If do not use qcow2_truncate(), I
> >think don't existing above issue.
> >
> >For my original thought, I want to say:
> >Sometimes the second L2 table will contain some entry, the pointer in this
> >entry will point to a cluster which address is larger than 1.25 GB.
> 
> Correct.
> 
> >So if not use qcow2_truncate(), won't discard above cluster which address is
> >larger than 1.25 GB.

Sorry, I do not express my meaning clearly. 

Here I want to say:

Some entry (let's call it entry1) will point to a cluster (let's call it
cluster1) whose address is larger than 1.25 GB, so if we use qcow2_truncate(),
this cluster1 will be discarded. Then entry1 will be in error after cluster1
is discarded. If we do not use qcow2_truncate(), cluster1 won't be discarded.

> 
> I'm sorry, I can't really follow what you are trying to say here, so I'll
> just try to reply with things that may or may not be what you wanted to talk
> about.
> 
> If you are using qemu-img resize and thus subsequently qcow2_truncate() to
> shrink an image, you cannot expect the image to shrink to the specified file
> length, for several reasons.
> 
> First, if you shrink it to 1 GB, but only half of that is actually used, the
> image might of course very well have a length below 1 GB.
> 
> Second, there is metadata overhead. So if you are changing the guest disk
> size to 1 GB (all of which is occupied), the host file size will exceed 1 GB
> because of that overhead.
> 
> Third, I keep repeating myself here, but file length is not file size. So
> you may observe a file length of 10 GB or more because the clusters are
> spread all over the image file. This is something we'd have to combat with
> defragmentation; but the question is whether we really need to (see below
> for more on that). The point is that it doesn't matter whether the image has
> a file length of 10 GB; the file size will be around 1 GB anyway.
> 
> >But I still have another worry.
> >
> >Suppose "virtual size" and "disk size" are all 2G. After we resize it to
> >1.25G, seems we will get "virtual size" is 1.25G but "disk size" is still 2G
> 
> No, it won't. I can prove it to you:

Yes, you are right. I have double-checked my PATCH v5. It seems I don't use
qcow2_process_discards (this function calls bdrv_discard() to discard
clusters on the host) in my patch v5. I will submit a new version of the
patch. Thanks.

Regards,
Jun Li

> 
> $ qemu-img create -f qcow2 test.qcow2 64M
> $ qemu-io -c 'write 0 64M' test.qcow2
> $ qemu-img info test.qcow2
> ...
> disk size: 64M
> ...
> 
> Okay, so far it's just what we'd expect. Now let's implement my proposal for
> truncation: Let's assume the image should be shrinked to 32 MB, so we
> discard all clusters starting at 32 MB (guest offset) (which is 64 MB - 32
> MB = 32 MB of data):
> 
> $ qemu-io -c 'discard 32M 32M' test.qcow2
> $ qemu-img info test.qcow2
> ...
> disk size: 32M
> ...
> 
> Great!
> 
> >if do not use "qcow2_truncate()" to truncate the file(Yes, I know use
> >qcow2_truncate is not a resolution). This seems strange, not so perfect.
> >
> >>We know that every cluster references somewhere after that limit (that is,
> >>every entry in the fourth L2 table and every entry starting with index 4,096
> >>in the third L2 table) is a data cluster with a guest offset somewhere
> >>beyond 1.25 GB, so we don't need it anymore.
> >>
> >>Thus, we simply discard all those data clusters and after that we can
> >>discard the fourth L2 table. That's it.
> >>
> >>If we really want to we can calculate the highest cluster host offset in use
> >>and truncate the image accordingly. But that's optional, see the last point
> >>in my "problems with this approach" list (having discarded the clusters
> >>should save us all the space already). Furthermore, as I'm saying in that
> >>list, to really solve this issue, we'd need qcow2 defragmentation.
> >>
> >Do we already have "qcow2 defragmentation" realization?
> 
> No, we don't. The only way to defragment a qcow2 image right now is using
> qemu-img convert to create a (defragmented) copy and then delete the old
> image, which has the disadvantage of temporarily requiring double the disk
> space and being an offline operation.
> 
> So far, nobody has implemented online defragmentation, mainly for two
> reasons: It would probably be pretty complicated (it'd probably need to be a
> block job which links into a pretty low-level function provided by qcow2
> (defragment_some_clusters or something)) and second, so far there has been
> little demand. Disk space is not an issue (as said before), because it
> doesn't really matter to a modern file system whether your file has a length
> of 100 MB of 100 GB; that's just some number. What really matters is how
> much of that space is actually used; and if all unused clusters are
> discarded, there won't be any space used for them (well, maybe there is some
> metadata overhead, but that should be negligible).
> 
> There are a couple of reasons why you'd want to defragment an image:
> 
> First, it makes you feel better. I can relate to that, but it's not a real
> reason.
> 
> Second, it may improve performance: The guest may expect consecutive reads
> to be fast; but if the clusters are sprinkled all over the host, consecutive
> guest reads no longer necessarily translate to consecutive reads on the host
> (same for writes, of course). Defragmentation would probably fix that, but
> if you want to rely on this, you'd better use preallocated image files.
> 
> Third, it looks better. People expect the file length to be raw indicator of
> the file size. However, for me this is related to "it makes you feel
> better", because this also is not a really good reason.
> 
> Fourth, using a non-modern file system may let your file size explode
> because suddenly, file length is actually equal to the file size. But I
> think, in this case you should just use a better file system.
> 
> I don't know whether "cp" copies holes in files; its manpage says it does
> create sparse images, but I don't know how well it works; but I just assume
> it works well enough.
> 
> Max
> 
> >Jun Li
>
diff mbox

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4d888c7..28d2d62 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -29,6 +29,9 @@ 
 #include "block/qcow2.h"
 #include "trace.h"
 
+static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
+                   uint64_t **l2_table);
+
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size)
 {
@@ -135,6 +138,185 @@  int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
     return ret;
 }
 
+int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
+                                 int new_l2_index, int64_t boundary_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int new_l1_size2, ret, i;
+    uint64_t *new_l1_table;
+    int64_t new_l1_table_offset;
+    int64_t old_l1_table_offset, old_l1_size;
+    uint8_t data[12];
+    uint64_t l2_offset;
+    uint64_t *l2_table, l2_entry;
+    int64_t l2_free_entry; /* The entry of l2 table need to free from */
+    uint64_t *old_l1_table = s->l1_table;
+    int num = s->l1_size - new_l1_size;
+
+    assert(new_l1_size <= s->l1_size);
+    while ((num >= -1) && (s->l1_size + num - 1 >= 0)) {
+        l2_free_entry = 0;
+        l2_offset = old_l1_table[s->l1_size + num - 1] & L1E_OFFSET_MASK;
+
+        if (l2_offset == 0) {
+            goto retry;
+        }
+
+        if (num == 0) {
+            if (new_l2_index == 0) {
+                goto retry;
+            }
+            l2_free_entry = new_l2_index;
+        }
+
+        /* load l2_table into cache */
+        ret = l2_load(bs, l2_offset, &l2_table);
+
+        if (ret < 0) {
+            return ret;
+        }
+
+        for (i = s->l2_size - 1; i >= 0; i--) {
+            l2_entry = be64_to_cpu(l2_table[i]);
+
+            /* Due to COW, the clusters in l2 table will
+             * not in sequential order, so there will be
+             * some l2_entry >= boundary_size when perform shrinking.
+             */
+            if (num == -1) {
+                if (l2_entry >= boundary_size) {
+                    goto free_cluster;
+                } else {
+                    continue;
+                }
+            }
+
+            /* Deal with COW clusters in l2 table when num == 0 */
+            if (i <= l2_free_entry - 1) {
+                if (l2_entry >= boundary_size) {
+                    goto free_cluster;
+                }
+                continue;
+            }
+
+            switch (qcow2_get_cluster_type(l2_entry)) {
+            case QCOW2_CLUSTER_UNALLOCATED:
+                if (!bs->backing_hd) {
+                    continue;
+                }
+                break;
+
+            case QCOW2_CLUSTER_ZERO:
+                continue;
+
+            case QCOW2_CLUSTER_NORMAL:
+            case QCOW2_CLUSTER_COMPRESSED:
+                break;
+
+            default:
+                abort();
+            }
+
+        free_cluster:
+            qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+
+            if (s->qcow_version >= 3) {
+                l2_table[i] = cpu_to_be64(QCOW_OFLAG_ZERO);
+            } else {
+                l2_table[i] = cpu_to_be64(0);
+            }
+
+            /* Then decrease the refcount */
+            qcow2_free_any_clusters(bs, l2_entry, 1, QCOW2_DISCARD_MAX);
+        }
+
+        ret = qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+        if (ret < 0) {
+            return ret;
+        }
+        if (l2_free_entry == 0 && num != -1) {
+            qemu_vfree(l2_table);
+            qcow2_free_clusters(bs, l2_offset, s->cluster_size - 1,
+                                QCOW2_DISCARD_OTHER);
+        }
+    retry:
+        num--;
+    }
+
+    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+    new_l1_table = qemu_try_blockalign(bs->file,
+                                       align_offset(new_l1_size2, 512));
+    if (new_l1_table == NULL) {
+        return -ENOMEM;
+    }
+    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
+
+    /* shrinking l1 table */
+    memcpy(new_l1_table, s->l1_table, new_l1_size2);
+
+    /* write new table (align to cluster) */
+    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
+
+    if ((new_l1_table_offset) >= boundary_size) {
+        goto fail;
+    }
+
+    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* the L1 position has not yet been updated, so these clusters must
+     * indeed be completely free */
+    ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset,
+                                        new_l1_size2);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
+
+    for (i = 0; i < new_l1_size; i++) {
+        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+    }
+
+    ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset,
+                           new_l1_table, new_l1_size2);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    for (i = 0; i < new_l1_size; i++) {
+        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+    }
+
+    /* set new table */
+    BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE);
+    cpu_to_be32w((uint32_t *)data, new_l1_size);
+    stq_be_p(data + 4, new_l1_table_offset);
+    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size),
+                           data, sizeof(data));
+    if (ret < 0) {
+        goto fail;
+    }
+
+    qemu_vfree(s->l1_table);
+    old_l1_table_offset = s->l1_table_offset;
+    s->l1_table_offset = new_l1_table_offset;
+    s->l1_table = new_l1_table;
+    old_l1_size = s->l1_size;
+    s->l1_size = new_l1_size;
+    qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t),
+                        QCOW2_DISCARD_OTHER);
+    return 0;
+ fail:
+    qemu_vfree(new_l1_table);
+    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2,
+                        QCOW2_DISCARD_OTHER);
+    return ret;
+}
+
 /*
  * l2_load
  *
diff --git a/block/qcow2.c b/block/qcow2.c
index d031515..d2b0dfe 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2111,10 +2111,41 @@  static int qcow2_truncate(BlockDriverState *bs, int64_t offset)
         return -ENOTSUP;
     }
 
-    /* shrinking is currently not supported */
+    /* shrinking image */
     if (offset < bs->total_sectors * 512) {
-        error_report("qcow2 doesn't support shrinking images yet");
-        return -ENOTSUP;
+        /* As l1 table, l2 table, refcount table, refcount block table
+         * and file header of the qcow2 image need to use some clusters,
+         * so should subtract these metadata from offset.
+         */
+        int64_t nb_l1 = DIV_ROUND_UP((uint64_t)s->l1_size * sizeof(uint64_t),
+                                     s->cluster_size);
+        int64_t nb_l2 = DIV_ROUND_UP(offset, (uint64_t)s->l2_size <<
+                                     s->cluster_bits);
+        int64_t nb_refcount_block_table = DIV_ROUND_UP(offset, (uint64_t)
+                                                       s->cluster_size <<
+                                                       s->refcount_block_bits);
+        int64_t nb_refcount_table = DIV_ROUND_UP(nb_refcount_block_table << 3,
+                                                 s->cluster_size);
+        int64_t total_nb = 2 * nb_l2 + nb_l1 + nb_refcount_block_table +
+                           nb_refcount_table + 1;
+        int64_t offset_for_shrink = offset - (total_nb << s->cluster_bits);
+        int new_l2_index = offset_to_l2_index(s, offset_for_shrink);
+
+        new_l1_size = size_to_l1(s, offset_for_shrink);
+        ret = qcow2_shrink_l1_and_l2_table(bs, new_l1_size, new_l2_index,
+                                           offset);
+        if (ret < 0) {
+            return ret;
+        }
+
+        int64_t actual_size = bdrv_get_allocated_file_size(bs);
+
+        if (offset < actual_size) {
+            ret = bdrv_truncate(bs->file, offset);
+            if (ret < 0) {
+                return ret;
+            }
+        }
     }
 
     new_l1_size = size_to_l1(s, offset);
diff --git a/block/qcow2.h b/block/qcow2.h
index 577ccd1..be1237d 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -516,6 +516,8 @@  int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
+int qcow2_shrink_l1_and_l2_table(BlockDriverState *bs, uint64_t new_l1_size,
+                                 int new_l2_index, int64_t boundary_size);
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
 void qcow2_l2_cache_reset(BlockDriverState *bs);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);