Patchwork [v4,2/5] qcow2-cluster: Expand zero clusters

login
register
mail settings
Submitter Max Reitz
Date Sept. 2, 2013, 10:04 a.m.
Message ID <1378116246-12855-3-git-send-email-mreitz@redhat.com>
Download mbox | patch
Permalink /patch/271850/
State New
Headers show

Comments

Max Reitz - Sept. 2, 2013, 10:04 a.m.
Add functionality for expanding zero clusters. This is necessary for
downgrading the image version to one without zero cluster support.

For non-backed images, this function may also just discard zero clusters
instead of truly expanding them.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c  | 228 +++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2-refcount.c |  29 ++++---
 block/qcow2.h          |   5 ++
 3 files changed, 248 insertions(+), 14 deletions(-)
Kevin Wolf - Sept. 2, 2013, 3:13 p.m.
Am 02.09.2013 um 12:04 hat Max Reitz geschrieben:
> Add functionality for expanding zero clusters. This is necessary for
> downgrading the image version to one without zero cluster support.
> 
> For non-backed images, this function may also just discard zero clusters
> instead of truly expanding them.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2-cluster.c  | 228 +++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2-refcount.c |  29 ++++---
>  block/qcow2.h          |   5 ++
>  3 files changed, 248 insertions(+), 14 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 2d5aa92..c90fb51 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1497,3 +1497,231 @@ fail:
>  
>      return ret;
>  }
> +
> +/*
> + * Expands all zero clusters in a specific L1 table (or deallocates them, for
> + * non-backed non-pre-allocated zero clusters).
> + *
> + * expanded_clusters is a bitmap where every bit corresponds to one cluster in
> + * the image file; a bit gets set if the corresponding cluster has been used for
> + * zero expansion (i.e., has been filled with zeroes and is referenced from an
> + * L2 table). nb_clusters contains the total cluster count of the image file,
> + * i.e., the number of bits in expanded_clusters.
> + */
> +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
> +                                      int l1_size, uint8_t *expanded_clusters,
> +                                      uint64_t nb_clusters)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    bool is_active_l1 = (l1_table == s->l1_table);
> +    uint64_t *l2_table = NULL;
> +    int ret;
> +    int i, j;
> +
> +    if (!is_active_l1) {
> +        /* inactive L2 tables require a buffer to be stored in when loading
> +         * them from disk */
> +        l2_table = qemu_blockalign(bs, s->cluster_size);
> +    }
> +
> +    for (i = 0; i < l1_size; i++) {
> +        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
> +        bool l2_dirty = false;
> +
> +        if (!l2_offset) {
> +            /* unallocated */
> +            continue;
> +        }
> +
> +        if (is_active_l1) {
> +            /* get active L2 tables from cache */
> +            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
> +                    (void **)&l2_table);
> +        } else {
> +            /* load inactive L2 tables from disk */
> +            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
> +                    (void *)l2_table, s->cluster_sectors);
> +        }
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +
> +        for (j = 0; j < s->l2_size; j++) {
> +            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
> +            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
> +            int cluster_type = qcow2_get_cluster_type(l2_entry);
> +
> +            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
> +                cluster_index = offset >> s->cluster_bits;
> +                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
> +                if (expanded_clusters[cluster_index / 8] &
> +                    (1 << (cluster_index % 8))) {
> +                    /* Probably a shared L2 table; this cluster was a zero
> +                     * cluster which has been expanded, its refcount
> +                     * therefore most likely requires an update. */
> +                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
> +                                                        QCOW2_DISCARD_NEVER);
> +                    if (ret < 0) {
> +                        goto fail;
> +                    }
> +                    /* Since we just increased the refcount, the COPIED flag may
> +                     * no longer be set. */
> +                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
> +                    l2_dirty = true;
> +                }
> +                continue;
> +            }
> +            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
> +                continue;
> +            }
> +
> +            if (!offset) {
> +                /* not preallocated */
> +                if (!bs->backing_hd) {
> +                    /* not backed; therefore we can simply deallocate the
> +                     * cluster */
> +                    l2_table[j] = 0;
> +                    l2_dirty = true;
> +                    continue;
> +                }
> +
> +                offset = qcow2_alloc_clusters(bs, s->cluster_size);
> +                if (offset < 0) {
> +                    ret = offset;
> +                    goto fail;
> +                }
> +            }
> +
> +            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
> +                                                offset, s->cluster_size);
> +            if (ret < 0) {
> +                qcow2_free_clusters(bs, offset, s->cluster_size,
> +                        QCOW2_DISCARD_ALWAYS);
> +                goto fail;
> +            }
> +
> +            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
> +                                    s->cluster_sectors);
> +            if (ret < 0) {
> +                qcow2_free_clusters(bs, offset, s->cluster_size,
> +                        QCOW2_DISCARD_ALWAYS);
> +                goto fail;
> +            }
> +
> +            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
> +            l2_dirty = true;
> +
> +            cluster_index = offset >> s->cluster_bits;
> +            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
> +            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
> +        }
> +
> +        if (is_active_l1) {
> +            if (l2_dirty) {
> +                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
> +                qcow2_cache_depends_on_flush(s->l2_table_cache);
> +            }
> +            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
> +            if (ret < 0) {
> +                l2_table = NULL;
> +                goto fail;
> +            }
> +        } else {
> +            if (l2_dirty) {
> +                ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
> +                        ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), l2_offset,
> +                        s->cluster_size);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +
> +                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
> +                        (void *)l2_table, s->cluster_sectors);
> +                if (ret < 0) {
> +                    goto fail;
> +                }
> +            }
> +        }
> +    }
> +
> +    ret = 0;
> +
> +fail:
> +    if (l2_table) {
> +        if (!is_active_l1) {
> +            qemu_vfree(l2_table);
> +        } else {
> +            if (ret < 0) {
> +                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
> +            } else {
> +                ret = qcow2_cache_put(bs, s->l2_table_cache,
> +                        (void **)&l2_table);
> +            }
> +        }
> +    }
> +    return ret;
> +}
> +
> +/*
> + * For backed images, expands all zero clusters on the image. For non-backed
> + * images, deallocates all non-pre-allocated zero clusters (and claims the
> + * allocation for pre-allocated ones). This is important for downgrading to a
> + * qcow2 version which doesn't yet support metadata zero clusters.
> + */
> +int qcow2_expand_zero_clusters(BlockDriverState *bs)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    uint64_t *l1_table = NULL;
> +    uint64_t nb_clusters;
> +    uint8_t *expanded_clusters;
> +    int ret;
> +    int i, j;
> +
> +    nb_clusters = bs->total_sectors >> (s->cluster_bits - BDRV_SECTOR_BITS);

Shouldn't this value actually be rounded up?

> +    expanded_clusters = g_malloc0(nb_clusters / 8);

Just like the size here?

> +
> +    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
> +                                     expanded_clusters, nb_clusters);
> +    if (ret < 0) {
> +        goto fail;
> +    }
> +
> +    /* Inactive L1 tables may point to active L2 tables - therefore it is
> +     * necessary to flush the L2 table cache before trying to access the L2
> +     * tables pointed to by inactive L1 entries (else we might try to expand
> +     * zero clusters that have already been expanded) */
> +    ret = qcow2_cache_flush(bs, s->l2_table_cache);
> +    if (ret < 0) {
> +        goto fail;
> +    }

I'm afraid that this is not enough in fact. If you modify an active L2
table through an inactive L1, you bypass the cache and use bdrv_write().
The cache will still have a stale copy of the table.

Can't we just access inactive L2 tables through the cache as well? Or
can we get into trouble in other places if we do that? If so, we'd have
to implement a function that flushes _and empties_ the cache. (And then
we could always go the bdrv_read/write path, so there's some code to
save either way)

> +    for (i = 0; i < s->nb_snapshots; i++) {
> +        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
> +                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
> +
> +        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
> +
> +        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
> +                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +
> +        for (j = 0; j < s->snapshots[i].l1_size; j++) {
> +            be64_to_cpus(&l1_table[j]);
> +        }
> +
> +        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
> +                                         expanded_clusters, nb_clusters);
> +        if (ret < 0) {
> +            goto fail;
> +        }
> +    }
> +
> +    ret = 0;
> +
> +fail:
> +    g_free(expanded_clusters);
> +    g_free(l1_table);
> +    return ret;
> +}

Kevin
Max Reitz - Sept. 3, 2013, 7:11 a.m.
Am 02.09.2013 17:13, schrieb Kevin Wolf:
> Am 02.09.2013 um 12:04 hat Max Reitz geschrieben:
>> Add functionality for expanding zero clusters. This is necessary for
>> downgrading the image version to one without zero cluster support.
>>
>> For non-backed images, this function may also just discard zero clusters
>> instead of truly expanding them.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   block/qcow2-cluster.c  | 228 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/qcow2-refcount.c |  29 ++++---
>>   block/qcow2.h          |   5 ++
>>   3 files changed, 248 insertions(+), 14 deletions(-)
>>
>> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
>> index 2d5aa92..c90fb51 100644
>> --- a/block/qcow2-cluster.c
>> +++ b/block/qcow2-cluster.c
>> @@ -1497,3 +1497,231 @@ fail:
>>   
>>       return ret;
>>   }
>> +
>> +/*
>> + * Expands all zero clusters in a specific L1 table (or deallocates them, for
>> + * non-backed non-pre-allocated zero clusters).
>> + *
>> + * expanded_clusters is a bitmap where every bit corresponds to one cluster in
>> + * the image file; a bit gets set if the corresponding cluster has been used for
>> + * zero expansion (i.e., has been filled with zeroes and is referenced from an
>> + * L2 table). nb_clusters contains the total cluster count of the image file,
>> + * i.e., the number of bits in expanded_clusters.
>> + */
>> +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
>> +                                      int l1_size, uint8_t *expanded_clusters,
>> +                                      uint64_t nb_clusters)
>> +{
>> +    BDRVQcowState *s = bs->opaque;
>> +    bool is_active_l1 = (l1_table == s->l1_table);
>> +    uint64_t *l2_table = NULL;
>> +    int ret;
>> +    int i, j;
>> +
>> +    if (!is_active_l1) {
>> +        /* inactive L2 tables require a buffer to be stored in when loading
>> +         * them from disk */
>> +        l2_table = qemu_blockalign(bs, s->cluster_size);
>> +    }
>> +
>> +    for (i = 0; i < l1_size; i++) {
>> +        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
>> +        bool l2_dirty = false;
>> +
>> +        if (!l2_offset) {
>> +            /* unallocated */
>> +            continue;
>> +        }
>> +
>> +        if (is_active_l1) {
>> +            /* get active L2 tables from cache */
>> +            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
>> +                    (void **)&l2_table);
>> +        } else {
>> +            /* load inactive L2 tables from disk */
>> +            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
>> +                    (void *)l2_table, s->cluster_sectors);
>> +        }
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +
>> +        for (j = 0; j < s->l2_size; j++) {
>> +            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
>> +            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
>> +            int cluster_type = qcow2_get_cluster_type(l2_entry);
>> +
>> +            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
>> +                cluster_index = offset >> s->cluster_bits;
>> +                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
>> +                if (expanded_clusters[cluster_index / 8] &
>> +                    (1 << (cluster_index % 8))) {
>> +                    /* Probably a shared L2 table; this cluster was a zero
>> +                     * cluster which has been expanded, its refcount
>> +                     * therefore most likely requires an update. */
>> +                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
>> +                                                        QCOW2_DISCARD_NEVER);
>> +                    if (ret < 0) {
>> +                        goto fail;
>> +                    }
>> +                    /* Since we just increased the refcount, the COPIED flag may
>> +                     * no longer be set. */
>> +                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
>> +                    l2_dirty = true;
>> +                }
>> +                continue;
>> +            }
>> +            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
>> +                continue;
>> +            }
>> +
>> +            if (!offset) {
>> +                /* not preallocated */
>> +                if (!bs->backing_hd) {
>> +                    /* not backed; therefore we can simply deallocate the
>> +                     * cluster */
>> +                    l2_table[j] = 0;
>> +                    l2_dirty = true;
>> +                    continue;
>> +                }
>> +
>> +                offset = qcow2_alloc_clusters(bs, s->cluster_size);
>> +                if (offset < 0) {
>> +                    ret = offset;
>> +                    goto fail;
>> +                }
>> +            }
>> +
>> +            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
>> +                                                offset, s->cluster_size);
>> +            if (ret < 0) {
>> +                qcow2_free_clusters(bs, offset, s->cluster_size,
>> +                        QCOW2_DISCARD_ALWAYS);
>> +                goto fail;
>> +            }
>> +
>> +            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
>> +                                    s->cluster_sectors);
>> +            if (ret < 0) {
>> +                qcow2_free_clusters(bs, offset, s->cluster_size,
>> +                        QCOW2_DISCARD_ALWAYS);
>> +                goto fail;
>> +            }
>> +
>> +            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
>> +            l2_dirty = true;
>> +
>> +            cluster_index = offset >> s->cluster_bits;
>> +            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
>> +            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
>> +        }
>> +
>> +        if (is_active_l1) {
>> +            if (l2_dirty) {
>> +                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
>> +                qcow2_cache_depends_on_flush(s->l2_table_cache);
>> +            }
>> +            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
>> +            if (ret < 0) {
>> +                l2_table = NULL;
>> +                goto fail;
>> +            }
>> +        } else {
>> +            if (l2_dirty) {
>> +                ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
>> +                        ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), l2_offset,
>> +                        s->cluster_size);
>> +                if (ret < 0) {
>> +                    goto fail;
>> +                }
>> +
>> +                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
>> +                        (void *)l2_table, s->cluster_sectors);
>> +                if (ret < 0) {
>> +                    goto fail;
>> +                }
>> +            }
>> +        }
>> +    }
>> +
>> +    ret = 0;
>> +
>> +fail:
>> +    if (l2_table) {
>> +        if (!is_active_l1) {
>> +            qemu_vfree(l2_table);
>> +        } else {
>> +            if (ret < 0) {
>> +                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
>> +            } else {
>> +                ret = qcow2_cache_put(bs, s->l2_table_cache,
>> +                        (void **)&l2_table);
>> +            }
>> +        }
>> +    }
>> +    return ret;
>> +}
>> +
>> +/*
>> + * For backed images, expands all zero clusters on the image. For non-backed
>> + * images, deallocates all non-pre-allocated zero clusters (and claims the
>> + * allocation for pre-allocated ones). This is important for downgrading to a
>> + * qcow2 version which doesn't yet support metadata zero clusters.
>> + */
>> +int qcow2_expand_zero_clusters(BlockDriverState *bs)
>> +{
>> +    BDRVQcowState *s = bs->opaque;
>> +    uint64_t *l1_table = NULL;
>> +    uint64_t nb_clusters;
>> +    uint8_t *expanded_clusters;
>> +    int ret;
>> +    int i, j;
>> +
>> +    nb_clusters = bs->total_sectors >> (s->cluster_bits - BDRV_SECTOR_BITS);
> Shouldn't this value actually be rounded up?
Oh, yes, of course.

>> +    expanded_clusters = g_malloc0(nb_clusters / 8);
> Just like the size here?
>
>> +
>> +    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
>> +                                     expanded_clusters, nb_clusters);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
>> +
>> +    /* Inactive L1 tables may point to active L2 tables - therefore it is
>> +     * necessary to flush the L2 table cache before trying to access the L2
>> +     * tables pointed to by inactive L1 entries (else we might try to expand
>> +     * zero clusters that have already been expanded) */
>> +    ret = qcow2_cache_flush(bs, s->l2_table_cache);
>> +    if (ret < 0) {
>> +        goto fail;
>> +    }
> I'm afraid that this is not enough in fact. If you modify an active L2
> table through an inactive L1, you bypass the cache and use bdrv_write().
> The cache will still have a stale copy of the table.
Hm, yes, that didn't occur to me because the image isn't being written 
to after this zero cluster expansion; at least not in the tests I ran.

> Can't we just access inactive L2 tables through the cache as well? Or
> can we get into trouble in other places if we do that? If so, we'd have
> to implement a function that flushes _and empties_ the cache. (And then
> we could always go the bdrv_read/write path, so there's some code to
> save either way)
I'd personally go for emptying the cache.

>> +    for (i = 0; i < s->nb_snapshots; i++) {
>> +        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
>> +                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
>> +
>> +        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
>> +
>> +        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
>> +                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +
>> +        for (j = 0; j < s->snapshots[i].l1_size; j++) {
>> +            be64_to_cpus(&l1_table[j]);
>> +        }
>> +
>> +        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
>> +                                         expanded_clusters, nb_clusters);
>> +        if (ret < 0) {
>> +            goto fail;
>> +        }
>> +    }
>> +
>> +    ret = 0;
>> +
>> +fail:
>> +    g_free(expanded_clusters);
>> +    g_free(l1_table);
>> +    return ret;
>> +}
> Kevin

Max

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 2d5aa92..c90fb51 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1497,3 +1497,231 @@  fail:
 
     return ret;
 }
+
+/*
+ * Expands all zero clusters in a specific L1 table (or deallocates them, for
+ * non-backed non-pre-allocated zero clusters).
+ *
+ * expanded_clusters is a bitmap where every bit corresponds to one cluster in
+ * the image file; a bit gets set if the corresponding cluster has been used for
+ * zero expansion (i.e., has been filled with zeroes and is referenced from an
+ * L2 table). nb_clusters contains the total cluster count of the image file,
+ * i.e., the number of bits in expanded_clusters.
+ */
+static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
+                                      int l1_size, uint8_t *expanded_clusters,
+                                      uint64_t nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+    bool is_active_l1 = (l1_table == s->l1_table);
+    uint64_t *l2_table = NULL;
+    int ret;
+    int i, j;
+
+    if (!is_active_l1) {
+        /* inactive L2 tables require a buffer to be stored in when loading
+         * them from disk */
+        l2_table = qemu_blockalign(bs, s->cluster_size);
+    }
+
+    for (i = 0; i < l1_size; i++) {
+        uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK;
+        bool l2_dirty = false;
+
+        if (!l2_offset) {
+            /* unallocated */
+            continue;
+        }
+
+        if (is_active_l1) {
+            /* get active L2 tables from cache */
+            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
+                    (void **)&l2_table);
+        } else {
+            /* load inactive L2 tables from disk */
+            ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                    (void *)l2_table, s->cluster_sectors);
+        }
+        if (ret < 0) {
+            goto fail;
+        }
+
+        for (j = 0; j < s->l2_size; j++) {
+            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+            int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index;
+            int cluster_type = qcow2_get_cluster_type(l2_entry);
+
+            if (cluster_type == QCOW2_CLUSTER_NORMAL) {
+                cluster_index = offset >> s->cluster_bits;
+                assert((cluster_index >= 0) && (cluster_index < nb_clusters));
+                if (expanded_clusters[cluster_index / 8] &
+                    (1 << (cluster_index % 8))) {
+                    /* Probably a shared L2 table; this cluster was a zero
+                     * cluster which has been expanded, its refcount
+                     * therefore most likely requires an update. */
+                    ret = qcow2_update_cluster_refcount(bs, cluster_index, 1,
+                                                        QCOW2_DISCARD_NEVER);
+                    if (ret < 0) {
+                        goto fail;
+                    }
+                    /* Since we just increased the refcount, the COPIED flag may
+                     * no longer be set. */
+                    l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED);
+                    l2_dirty = true;
+                }
+                continue;
+            }
+            else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) {
+                continue;
+            }
+
+            if (!offset) {
+                /* not preallocated */
+                if (!bs->backing_hd) {
+                    /* not backed; therefore we can simply deallocate the
+                     * cluster */
+                    l2_table[j] = 0;
+                    l2_dirty = true;
+                    continue;
+                }
+
+                offset = qcow2_alloc_clusters(bs, s->cluster_size);
+                if (offset < 0) {
+                    ret = offset;
+                    goto fail;
+                }
+            }
+
+            ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                                                offset, s->cluster_size);
+            if (ret < 0) {
+                qcow2_free_clusters(bs, offset, s->cluster_size,
+                        QCOW2_DISCARD_ALWAYS);
+                goto fail;
+            }
+
+            ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE,
+                                    s->cluster_sectors);
+            if (ret < 0) {
+                qcow2_free_clusters(bs, offset, s->cluster_size,
+                        QCOW2_DISCARD_ALWAYS);
+                goto fail;
+            }
+
+            l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED);
+            l2_dirty = true;
+
+            cluster_index = offset >> s->cluster_bits;
+            assert((cluster_index >= 0) && (cluster_index < nb_clusters));
+            expanded_clusters[cluster_index / 8] |= 1 << (cluster_index % 8);
+        }
+
+        if (is_active_l1) {
+            if (l2_dirty) {
+                qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table);
+                qcow2_cache_depends_on_flush(s->l2_table_cache);
+            }
+            ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+            if (ret < 0) {
+                l2_table = NULL;
+                goto fail;
+            }
+        } else {
+            if (l2_dirty) {
+                ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT &
+                        ~(QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2), l2_offset,
+                        s->cluster_size);
+                if (ret < 0) {
+                    goto fail;
+                }
+
+                ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE,
+                        (void *)l2_table, s->cluster_sectors);
+                if (ret < 0) {
+                    goto fail;
+                }
+            }
+        }
+    }
+
+    ret = 0;
+
+fail:
+    if (l2_table) {
+        if (!is_active_l1) {
+            qemu_vfree(l2_table);
+        } else {
+            if (ret < 0) {
+                qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
+            } else {
+                ret = qcow2_cache_put(bs, s->l2_table_cache,
+                        (void **)&l2_table);
+            }
+        }
+    }
+    return ret;
+}
+
+/*
+ * For backed images, expands all zero clusters on the image. For non-backed
+ * images, deallocates all non-pre-allocated zero clusters (and claims the
+ * allocation for pre-allocated ones). This is important for downgrading to a
+ * qcow2 version which doesn't yet support metadata zero clusters.
+ */
+int qcow2_expand_zero_clusters(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l1_table = NULL;
+    uint64_t nb_clusters;
+    uint8_t *expanded_clusters;
+    int ret;
+    int i, j;
+
+    nb_clusters = bs->total_sectors >> (s->cluster_bits - BDRV_SECTOR_BITS);
+    expanded_clusters = g_malloc0(nb_clusters / 8);
+
+    ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size,
+                                     expanded_clusters, nb_clusters);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /* Inactive L1 tables may point to active L2 tables - therefore it is
+     * necessary to flush the L2 table cache before trying to access the L2
+     * tables pointed to by inactive L1 entries (else we might try to expand
+     * zero clusters that have already been expanded) */
+    ret = qcow2_cache_flush(bs, s->l2_table_cache);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    for (i = 0; i < s->nb_snapshots; i++) {
+        int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) +
+                BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE;
+
+        l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE);
+
+        ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset /
+                BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        for (j = 0; j < s->snapshots[i].l1_size; j++) {
+            be64_to_cpus(&l1_table[j]);
+        }
+
+        ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size,
+                                         expanded_clusters, nb_clusters);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+
+    ret = 0;
+
+fail:
+    g_free(expanded_clusters);
+    g_free(l1_table);
+    return ret;
+}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index ba129de..4264148 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -601,10 +601,10 @@  fail:
  * If the return value is non-negative, it is the new refcount of the cluster.
  * If it is negative, it is -errno and indicates an error.
  */
-static int update_cluster_refcount(BlockDriverState *bs,
-                                   int64_t cluster_index,
-                                   int addend,
-                                   enum qcow2_discard_type type)
+int qcow2_update_cluster_refcount(BlockDriverState *bs,
+                                  int64_t cluster_index,
+                                  int addend,
+                                  enum qcow2_discard_type type)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
@@ -733,8 +733,8 @@  int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         if (free_in_cluster == 0)
             s->free_byte_offset = 0;
         if ((offset & (s->cluster_size - 1)) != 0)
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                    QCOW2_DISCARD_NEVER);
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                          QCOW2_DISCARD_NEVER);
     } else {
         offset = qcow2_alloc_clusters(bs, s->cluster_size);
         if (offset < 0) {
@@ -744,8 +744,8 @@  int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
         if ((cluster_offset + s->cluster_size) == offset) {
             /* we are lucky: contiguous data */
             offset = s->free_byte_offset;
-            update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                    QCOW2_DISCARD_NEVER);
+            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
+                                          QCOW2_DISCARD_NEVER);
             s->free_byte_offset += size;
         } else {
             s->free_byte_offset = offset;
@@ -754,8 +754,8 @@  int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
     }
 
     /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
-     * or explicitly by update_cluster_refcount().  Refcount blocks must be
-     * flushed before the caller's L2 table updates.
+     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
+     * be flushed before the caller's L2 table updates.
      */
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
     return offset;
@@ -896,8 +896,9 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
                             break;
                         }
                         if (addend != 0) {
-                            refcount = update_cluster_refcount(bs, cluster_index, addend,
-                                                               QCOW2_DISCARD_SNAPSHOT);
+                            refcount = qcow2_update_cluster_refcount(bs,
+                                    cluster_index, addend,
+                                    QCOW2_DISCARD_SNAPSHOT);
                         } else {
                             refcount = get_refcount(bs, cluster_index);
                         }
@@ -936,8 +937,8 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
 
             if (addend != 0) {
-                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend,
-                                                   QCOW2_DISCARD_SNAPSHOT);
+                refcount = qcow2_update_cluster_refcount(bs, l2_offset >>
+                        s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT);
             } else {
                 refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
             }
diff --git a/block/qcow2.h b/block/qcow2.h
index 10b7bf4..25176ac 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -406,6 +406,9 @@  int qcow2_update_header(BlockDriverState *bs);
 int qcow2_refcount_init(BlockDriverState *bs);
 void qcow2_refcount_close(BlockDriverState *bs);
 
+int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index,
+                                  int addend, enum qcow2_discard_type type);
+
 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
 int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
     int nb_clusters);
@@ -453,6 +456,8 @@  int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
     int nb_sectors);
 int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
 
+int qcow2_expand_zero_clusters(BlockDriverState *bs);
+
 /* qcow2-snapshot.c functions */
 int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
 int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);