diff mbox

[4/5] qcow2: Check allocations in qcow2_check

Message ID 1377522260-16676-5-git-send-email-mreitz@redhat.com
State New
Headers show

Commit Message

Max Reitz Aug. 26, 2013, 1:04 p.m. UTC
Adds a new function checking for overlapping cluster allocations.
Furthermore, qcow2_check now marks the image as consistent if no
corruptions have been found.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c         |  15 +-
 block/qcow2.h         |   2 +
 3 files changed, 429 insertions(+), 2 deletions(-)

Comments

Max Reitz Aug. 26, 2013, 1:15 p.m. UTC | #1
Hi,

On Mon, Aug 26, 2013 at 15:04 +0200, Max Reitz wrote:
> Adds a new function checking for overlapping cluster allocations.
> Furthermore, qcow2_check now marks the image as consistent if no
> corruptions have been found.
>
> Signed-off-by: Max Reitz <mreitz@redhat.com>
Such overlappings are often (if not always) found by the refcount checks 
as well. By increasing the refcount of the clusters affected and 
therefore enforcing COW, the collision is basically fixed as well 
without requiring this new function.

Thus I'm not sure whether this new function is actually necessary since 
the existing refcount check seems to cover all its functionality already.

But, well, I thought it couldn't hurt proposing this function at least. ;-)


Kind regards,

Max
Kevin Wolf Aug. 27, 2013, 12:23 p.m. UTC | #2
Am 26.08.2013 um 15:04 hat Max Reitz geschrieben:
> Adds a new function checking for overlapping cluster allocations.
> Furthermore, qcow2_check now marks the image as consistent if no
> corruptions have been found.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2-cluster.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.c         |  15 +-
>  block/qcow2.h         |   2 +
>  3 files changed, 429 insertions(+), 2 deletions(-)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index be35983..ea7d334 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1499,3 +1499,417 @@ fail:
>  
>      return ret;
>  }
> +
> +static const char *const overlap_strings[8] = {
> +    "main header",
> +    "active L1 table",
> +    "active L2 table",
> +    "refcount table",
> +    "refcount block",
> +    "snapshot table",
> +    "inactive L1 table",
> +    "inactive L2 table"
> +};
> +
> +/*
> + * Checks the table and cluster allocations for inconsistencies.
> + */
> +int qcow2_check_allocations(BlockDriverState *bs, BdrvCheckResult *result,
> +                            BdrvCheckMode fix)

For the record: We have decided to leave this out as the required
functionality is already provided by the refcount checks.

> diff --git a/block/qcow2.c b/block/qcow2.c
> index 95497c6..0b0f0ac 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -311,8 +311,19 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
>          return ret;
>      }
>  
> -    if (fix && result->check_errors == 0 && result->corruptions == 0) {
> -        return qcow2_mark_clean(bs);
> +    ret = qcow2_check_allocations(bs, result, fix);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    if (result->check_errors == 0 && result->corruptions == 0) {
> +        if (fix) {
> +            ret = qcow2_mark_clean(bs);
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +        return qcow2_mark_consistent(bs);

The qcow2_mark_consistent() call should probably be inside the if block,
otherwise you'll be trying to update the header of read-only image.

The reason why you didn't notice this is that qemu-img is broken since
commit 8599ea4c and doesn't report errors on negative return values any
more...

Kevin
diff mbox

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index be35983..ea7d334 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1499,3 +1499,417 @@  fail:
 
     return ret;
 }
+
+static const char *const overlap_strings[8] = {
+    "main header",
+    "active L1 table",
+    "active L2 table",
+    "refcount table",
+    "refcount block",
+    "snapshot table",
+    "inactive L1 table",
+    "inactive L2 table"
+};
+
+/*
+ * Checks the table and cluster allocations for inconsistencies.
+ */
+int qcow2_check_allocations(BlockDriverState *bs, BdrvCheckResult *result,
+                            BdrvCheckMode fix)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+    int i, j;
+    uint8_t *bm; /* bitmap */
+    /* bitmap byte shift (bytes per bitmap-bit = 1 << bm_shift) */
+    int bm_shift = s->cluster_bits;
+    uint64_t bm_size;
+
+    /* Checking every data cluster through qcow2_check_metadata_overlap takes a
+     * considerable amount of time; a bitmap would help out very much, although
+     * it quickly assumes ridiculous sizes if not limited; therefore we're
+     * grouping several clusters together to not exceed the size of the L1 table
+     * in RAM (which seems a reasonable limit). */
+    while ((bm_size = bs->total_sectors >> (bm_shift++ - BDRV_SECTOR_BITS)) >
+        s->l1_size * sizeof(uint64_t));
+
+    bm = g_malloc0(++bm_size);
+
+    bm[0] = 1; /* main header */
+    /* active L1 */
+    for (i = 0; i < s->l1_size * sizeof(uint64_t) >> bm_shift; i++) {
+        uintptr_t bit = i + (s->l1_table_offset >> bm_shift);
+        if (bit / 8 >= bm_size) {
+            fprintf(stderr, "ERROR L1 table offset is after EOF\n");
+            result->corruptions++;
+            /* this is such a fundamental error that it will probably crash
+             * qcow2_check_metadata_overlap if this function doesn't return
+             * already here (qcow2_check_metadata_overlap is for keeping
+             * consistency, therefore it always assumes a pretty consistent
+             * image) */
+            ret = 0;
+            goto fail;
+        }
+        bm[bit / 8] |= 1 << (bit % 8);
+    }
+
+    if (s->l1_table) {
+        /* active L2 */
+        for (i = 0; i < s->l1_size; i++) {
+            uintptr_t bit = (s->l1_table[i] & L1E_OFFSET_MASK) >> bm_shift;
+            if (bit / 8 >= bm_size) {
+                fprintf(stderr, "ERROR L2 table %i offset is after EOF\n", i);
+                result->corruptions++;
+                ret = 0;
+                goto fail;
+            }
+            /* the test for j != 0 is unneccessary, since that bit is set
+             * anyway (because of the main header) */
+            bm[bit / 8] |= 1 << (bit % 8);
+        }
+    }
+
+    /* snapshot table */
+    for (i = 0; i < s->snapshots_size >> bm_shift; i++) {
+        uintptr_t bit = i + (s->snapshots_offset >> bm_shift);
+        if (bit / 8 >= bm_size) {
+            fprintf(stderr, "ERROR snapshot table offset is after EOF\n");
+            result->corruptions++;
+            ret = 0;
+            goto fail;
+        }
+        bm[bit / 8] |= 1 << (bit % 8);
+    }
+
+    if (s->snapshots) {
+        /* inactive L1 */
+        for (i = 0; i < s->nb_snapshots; i++) {
+            for (j = 0; j < s->snapshots[i].l1_size * sizeof(uint64_t) >>
+                bm_shift; j++) {
+                uintptr_t bit =
+                        j + (s->snapshots[i].l1_table_offset >> bm_shift);
+                if (bit / 8 >= bm_size) {
+                    fprintf(stderr, "ERROR inactive L1 table %i offset is "
+                            "after EOF\n", i);
+                    result->corruptions++;
+                    ret = 0;
+                    goto fail;
+                }
+                bm[bit / 8] |= 1 << (bit % 8);
+            }
+        }
+    }
+
+    /* refcount table */
+    for (i = 0; i < s->refcount_table_size * sizeof(uint64_t) >> bm_shift; i++)
+    {
+        uintptr_t bit = i + (s->refcount_table_offset >> bm_shift);
+        if (bit / 8 >= bm_size) {
+            fprintf(stderr, "ERROR refcount table offset is after EOF\n");
+            result->corruptions++;
+            ret = 0;
+            goto fail;
+        }
+        bm[bit / 8] |= 1 << (bit % 8);
+    }
+
+    if (s->refcount_table) {
+        int initial_corruptions = result->corruptions;
+
+        /* Do the refcount blocks last; that way, we can directly use the bitmap
+         * to check for overlaps */
+        for (i = 0; i < s->refcount_table_size; i++) {
+            uint64_t rb_offset = s->refcount_table[i] & REFT_OFFSET_MASK;
+            uintptr_t bit = rb_offset >> bm_shift;
+
+            if (!rb_offset) {
+                /* unallocated */
+                continue;
+            }
+
+            if (rb_offset & (s->cluster_size - 1)) {
+                fprintf(stderr, "ERROR refcount block %i is not cluster "
+                        "aligned in image\n", i);
+                result->corruptions++;
+                continue;
+            }
+
+            if (bit / 8 >= bm_size) {
+                fprintf(stderr, "ERROR refcount block %i offset is after EOF\n",
+                        i);
+                result->corruptions++;
+                ret = 0;
+                goto fail;
+            }
+
+            if (bm[bit / 8] & (1 << (bit % 8))) {
+                ret = qcow2_check_metadata_overlap(bs,
+                        QCOW2_OL_CACHED & ~QCOW2_OL_REFCOUNT_BLOCK, rb_offset,
+                        s->cluster_size);
+                if (ret > 0) {
+                    fprintf(stderr, "ERROR refcount block %i overlaps with %s\n", i,
+                            overlap_strings[ffs(ret) - 1]);
+                    result->corruptions++;
+                } else if (ret < 0) {
+                    fprintf(stderr, "ERROR could not check refcount block %i overlap: "
+                            "%s\n", i, strerror(-ret));
+                    result->check_errors++;
+                    goto fail;
+                }
+            }
+        }
+
+        if (result->corruptions != initial_corruptions) {
+            ret = 0;
+            goto fail;
+        }
+
+        /* Now, enter the refcount blocks into the bitmap */
+        for (i = 0; i < s->refcount_table_size; i++) {
+            uint64_t rb_offset = s->refcount_table[i] & REFT_OFFSET_MASK;
+            uintptr_t bit = rb_offset >> bm_shift;
+            bm[bit / 8] |= 1 << (bit % 8);
+        }
+    }
+
+    ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_MAIN_HEADER |
+            QCOW2_OL_REFCOUNT_TABLE | QCOW2_OL_SNAPSHOT_TABLE,
+            s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+    if (ret > 0) {
+        fprintf(stderr, "ERROR L1 offset intersects %s\n",
+                overlap_strings[ffs(ret) - 1]);
+        result->corruptions++;
+        ret = 0;
+        goto fail;
+    } else if (ret < 0) {
+        fprintf(stderr, "ERROR could not check L1 overlap: %s\n",
+                strerror(-ret));
+        result->check_errors++;
+        goto fail;
+    }
+
+    if (s->l1_table) {
+        for (i = 0; i < s->l1_size; i++) {
+            uint64_t l2_offset = s->l1_table[i] & L1E_OFFSET_MASK;
+            uint64_t *l2;
+
+            if (!l2_offset) {
+                /* unallocated */
+                continue;
+            }
+
+            if (l2_offset & (s->cluster_size - 1)) {
+                fprintf(stderr, "ERROR L2 %i is not cluster aligned; L1 table "
+                        "entry corrupted\n", i);
+                result->corruptions++;
+                continue;
+            }
+
+            ret = qcow2_check_metadata_overlap(bs,
+                    QCOW2_OL_CACHED & ~QCOW2_OL_ACTIVE_L2, l2_offset,
+                    s->cluster_size);
+            if (ret > 0) {
+                fprintf(stderr, "ERROR L2 %i overlaps with %s\n", i,
+                        overlap_strings[ffs(ret) - 1]);
+                result->corruptions++;
+                continue;
+            } else if (ret < 0) {
+                fprintf(stderr, "ERROR could not check L2 %i overlap: %s\n", i,
+                        strerror(-ret));
+                result->check_errors++;
+                goto fail;
+            }
+
+            ret = l2_load(bs, l2_offset, &l2);
+            if ((ret < 0) || !l2) {
+                fprintf(stderr, "ERROR could not load L2 %i: %s\n", i,
+                        strerror(-ret));
+                result->check_errors++;
+                if (ret < 0) {
+                    goto fail;
+                } else {
+                    continue;
+                }
+            }
+
+            for (j = 0; j < s->l2_size; j++) {
+                uint64_t cl_offset, cl_size;
+                unsigned long long cl_index;
+                uint64_t l2_entry = be64_to_cpu(l2[j]);
+
+                if ((s->qcow_version >= 3) && (l2_entry & 1)) {
+                    /* cluster reads as all zeroes */
+                    continue;
+                }
+
+                if (l2_entry & (1ULL << 62)) { /* compressed */
+                    int xp1 = 62 - (s->cluster_bits - 9);
+                    cl_offset = l2_entry & ((1 << xp1) - 1);
+                    cl_size = ((l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK) >>
+                            xp1) * BDRV_SECTOR_SIZE;
+                } else { /* uncompressed */
+                    cl_offset = l2_entry & L2E_OFFSET_MASK;
+                    cl_size = s->cluster_size;
+                }
+
+                if (!cl_offset) {
+                    /* unallocated */
+                    continue;
+                }
+
+                cl_index = ((unsigned long long)i << s->l2_bits) | j;
+
+                if (!(l2_entry & (1ULL << 62)) &&
+                    cl_offset & (s->cluster_size - 1)) {
+                    fprintf(stderr, "ERROR data cluster 0x%llx is not cluster "
+                            "aligned in image\n", cl_index);
+                    result->corruptions++;
+                    continue;
+                }
+
+                if (!(bm[(cl_offset >> bm_shift) / 8] &
+                    (1 << ((cl_offset >> bm_shift) % 8)))) {
+                    continue;
+                }
+
+                ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_CACHED, cl_offset,
+                                                   cl_size);
+                if (ret > 0) {
+                    /* TODO: fix compressed clusters */
+                    if ((fix & BDRV_FIX_ERRORS) && !(l2_entry & (1ULL << 62))) {
+                        void *content = g_malloc(s->cluster_size);
+                        uint64_t sector = cl_index <<
+                                (s->cluster_bits - BDRV_SECTOR_BITS);
+
+                        fprintf(stderr, "Reallocating data cluster 0x%llx "
+                                "(currently overlapping with %s)\n", cl_index,
+                                overlap_strings[ffs(ret) - 1]);
+
+                        ret = bdrv_read(bs, sector, content,
+                                        s->cluster_sectors);
+                        if (ret < 0) {
+                            fprintf(stderr, "ERROR could not read cluster: "
+                                    "%s\n", strerror(-ret));
+                            result->corruptions++;
+                            g_free(content);
+                            qcow2_cache_put(bs, s->l2_table_cache,
+                                            (void **)&l2);
+                            goto fail;
+                        }
+
+                        /* unallocate cluster (but leaving the refcount for
+                         * now) */
+                        l2[j] = 0;
+
+                        /* FIXME: the image obviously is inconsistent, so this
+                         * write access may trigger new inconsistencies
+                         * (although the refcount check, which is supposed to be
+                         * performed before these checks, should take care of
+                         * this) */
+                        ret = bdrv_write(bs, sector, content,
+                                         s->cluster_sectors);
+                        if (ret < 0) {
+                            /* at least leave it as it was */
+                            l2[j] = cpu_to_be64(l2_entry);
+
+                            fprintf(stderr, "ERROR could not write cluster: "
+                                    "%s\n", strerror(-ret));
+                            result->corruptions++;
+                            g_free(content);
+                            qcow2_cache_put(bs, s->l2_table_cache,
+                                    (void **)&l2);
+                            goto fail;
+                        }
+
+                        g_free(content);
+                        result->corruptions_fixed++;
+
+                        /* Now decrement the refcount (although an invalid
+                         * refcount might have been the problem in the first
+                         * place, the refcount check (which is running before
+                         * this) will have fixed that) */
+                        qcow2_free_any_clusters(bs, l2_entry, 1,
+                                QCOW2_DISCARD_ALWAYS);
+                    } else {
+                        fprintf(stderr, "ERROR data cluster 0x%llx overlaps "
+                                "with %s\n", cl_index,
+                                overlap_strings[ffs(ret) - 1]);
+                        result->corruptions++;
+                    }
+                } else if (ret < 0) {
+                    fprintf(stderr, "ERROR could not check data cluster 0x%llx "
+                            "overlap: %s\n", cl_index, strerror(-ret));
+                    result->check_errors++;
+                    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2);
+                    goto fail;
+                }
+            }
+
+            qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2);
+        }
+    }
+
+    ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_MAIN_HEADER |
+            QCOW2_OL_ACTIVE_L1 | QCOW2_OL_SNAPSHOT_TABLE,
+            s->refcount_table_offset,
+            s->refcount_table_size * sizeof(uint64_t));
+    if (ret > 0) {
+        fprintf(stderr, "ERROR refcount table overlaps with %s\n",
+                overlap_strings[ffs(ret) - 1]);
+        result->corruptions++;
+        ret = 0;
+        goto fail;
+    } else if (ret < 0) {
+        fprintf(stderr, "ERROR could not check refcount table overlap: %s\n",
+                strerror(-ret));
+        result->check_errors++;
+        goto fail;
+    }
+
+    if (!s->refcount_table) {
+        fprintf(stderr, "ERROR could not check refcount block overlap: "
+                "refcount table not loaded\n");
+        result->check_errors++;
+        ret = -EIO;
+        goto fail;
+    }
+
+    /* refcount blocks have already been checked (when creating the bitmap) */
+
+    ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_MAIN_HEADER |
+            QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE, s->snapshots_offset,
+            s->snapshots_size);
+    if (ret > 0) {
+        fprintf(stderr, "ERROR snapshot table overlaps with %s\n",
+                overlap_strings[ffs(ret) - 1]);
+        result->corruptions++;
+        ret = 0;
+        goto fail;
+    } else if (ret < 0) {
+        fprintf(stderr, "ERROR could not check snapshot table overlap: %s\n",
+                strerror(-ret));
+        result->check_errors++;
+        goto fail;
+    }
+
+    /* We now know for sure that nothing overlaps with the inactive L1 tables -
+     * therefore, we can skip explicitly checking them. */
+
+    ret = 0;
+
+fail:
+    g_free(bm);
+
+    if (fix & BDRV_FIX_ERRORS) {
+        /* qemu-img check will double check if some corruptions could be fixed
+         * - without reopening the underlying image file. Since the refcount
+         * check doesn't use the L2 cache for some reason, it must be flushed
+         * to disk before the second check can commence (else it will report
+         * inexistent errors); and if we're already at it, the refcount cache
+         * may be flushed just as well */
+        qcow2_cache_flush(bs, s->l2_table_cache);
+        qcow2_cache_flush(bs, s->refcount_block_cache);
+    }
+
+    return ret;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index 95497c6..0b0f0ac 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -311,8 +311,19 @@  static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
         return ret;
     }
 
-    if (fix && result->check_errors == 0 && result->corruptions == 0) {
-        return qcow2_mark_clean(bs);
+    ret = qcow2_check_allocations(bs, result, fix);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (result->check_errors == 0 && result->corruptions == 0) {
+        if (fix) {
+            ret = qcow2_mark_clean(bs);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        return qcow2_mark_consistent(bs);
     }
     return ret;
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index 8a55da0..3772ab6 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -410,6 +410,8 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix);
+int qcow2_check_allocations(BlockDriverState *bs, BdrvCheckResult *result,
+                            BdrvCheckMode fix);
 
 void qcow2_process_discards(BlockDriverState *bs, int ret);