Patchwork [4/5] qcow2: Check allocations in qcow2_check

login
register
mail settings
Submitter Max Reitz
Date Aug. 26, 2013, 10:18 a.m.
Message ID <1377512306-27382-5-git-send-email-mreitz@redhat.com>
Download mbox | patch
Permalink /patch/269887/
State New
Headers show

Comments

Max Reitz - Aug. 26, 2013, 10:18 a.m.
Adds a new function checking for overlapping cluster allocations.
Furthermore, qcow2_check now marks the image as consistent if no
corruptions have been found.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c | 414 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.c         |  15 +-
 block/qcow2.h         |   2 +
 3 files changed, 429 insertions(+), 2 deletions(-)

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index be35983..ea7d334 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1499,3 +1499,417 @@  fail:
 
     return ret;
 }
+
+static const char *const overlap_strings[8] = {
+    "main header",
+    "active L1 table",
+    "active L2 table",
+    "refcount table",
+    "refcount block",
+    "snapshot table",
+    "inactive L1 table",
+    "inactive L2 table"
+};
+
+/*
+ * Checks the table and cluster allocations for inconsistencies.
+ */
+int qcow2_check_allocations(BlockDriverState *bs, BdrvCheckResult *result,
+                            BdrvCheckMode fix)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+    int i, j;
+    uint8_t *bm; /* bitmap */
+    /* bitmap byte shift (bytes per bitmap-bit = 1 << bm_shift) */
+    int bm_shift = s->cluster_bits;
+    uint64_t bm_size;
+
+    /* Checking every data cluster through qcow2_check_metadata_overlap takes a
+     * considerable amount of time; a bitmap would help out very much, although
+     * it quickly assumes ridiculous sizes if not limited; therefore we're
+     * grouping several clusters together to not exceed the size of the L1 table
+     * in RAM (which seems a reasonable limit). */
+    while ((bm_size = bs->total_sectors >> (bm_shift++ - BDRV_SECTOR_BITS)) >
+        s->l1_size * sizeof(uint64_t));
+
+    bm = g_malloc0(++bm_size);
+
+    bm[0] = 1; /* main header */
+    /* active L1 */
+    for (i = 0; i < s->l1_size * sizeof(uint64_t) >> bm_shift; i++) {
+        uintptr_t bit = i + (s->l1_table_offset >> bm_shift);
+        if (bit / 8 >= bm_size) {
+            fprintf(stderr, "ERROR L1 table offset is after EOF\n");
+            result->corruptions++;
+            /* this is such a fundamental error that it will probably crash
+             * qcow2_check_metadata_overlap if this function doesn't return
+             * already here (qcow2_check_metadata_overlap is for keeping
+             * consistency, therefore it always assumes a pretty consistent
+             * image) */
+            ret = 0;
+            goto fail;
+        }
+        bm[bit / 8] |= 1 << (bit % 8);
+    }
+
+    if (s->l1_table) {
+        /* active L2 */
+        for (i = 0; i < s->l1_size; i++) {
+            uintptr_t bit = (s->l1_table[i] & L1E_OFFSET_MASK) >> bm_shift;
+            if (bit / 8 >= bm_size) {
+                fprintf(stderr, "ERROR L2 table %i offset is after EOF\n", i);
+                result->corruptions++;
+                ret = 0;
+                goto fail;
+            }
+            /* the test for j != 0 is unneccessary, since that bit is set
+             * anyway (because of the main header) */
+            bm[bit / 8] |= 1 << (bit % 8);
+        }
+    }
+
+    /* snapshot table */
+    for (i = 0; i < s->snapshots_size >> bm_shift; i++) {
+        uintptr_t bit = i + (s->snapshots_offset >> bm_shift);
+        if (bit / 8 >= bm_size) {
+            fprintf(stderr, "ERROR snapshot table offset is after EOF\n");
+            result->corruptions++;
+            ret = 0;
+            goto fail;
+        }
+        bm[bit / 8] |= 1 << (bit % 8);
+    }
+
+    if (s->snapshots) {
+        /* inactive L1 */
+        for (i = 0; i < s->nb_snapshots; i++) {
+            for (j = 0; j < s->snapshots[i].l1_size * sizeof(uint64_t) >>
+                bm_shift; j++) {
+                uintptr_t bit =
+                        j + (s->snapshots[i].l1_table_offset >> bm_shift);
+                if (bit / 8 >= bm_size) {
+                    fprintf(stderr, "ERROR inactive L1 table %i offset is "
+                            "after EOF\n", i);
+                    result->corruptions++;
+                    ret = 0;
+                    goto fail;
+                }
+                bm[bit / 8] |= 1 << (bit % 8);
+            }
+        }
+    }
+
+    /* refcount table */
+    for (i = 0; i < s->refcount_table_size * sizeof(uint64_t) >> bm_shift; i++)
+    {
+        uintptr_t bit = i + (s->refcount_table_offset >> bm_shift);
+        if (bit / 8 >= bm_size) {
+            fprintf(stderr, "ERROR refcount table offset is after EOF\n");
+            result->corruptions++;
+            ret = 0;
+            goto fail;
+        }
+        bm[bit / 8] |= 1 << (bit % 8);
+    }
+
+    if (s->refcount_table) {
+        int initial_corruptions = result->corruptions;
+
+        /* Do the refcount blocks last; that way, we can directly use the bitmap
+         * to check for overlaps */
+        for (i = 0; i < s->refcount_table_size; i++) {
+            uint64_t rb_offset = s->refcount_table[i] & REFT_OFFSET_MASK;
+            uintptr_t bit = rb_offset >> bm_shift;
+
+            if (!rb_offset) {
+                /* unallocated */
+                continue;
+            }
+
+            if (rb_offset & (s->cluster_size - 1)) {
+                fprintf(stderr, "ERROR refcount block %i is not cluster "
+                        "aligned in image\n", i);
+                result->corruptions++;
+                continue;
+            }
+
+            if (bit / 8 >= bm_size) {
+                fprintf(stderr, "ERROR refcount block %i offset is after EOF\n",
+                        i);
+                result->corruptions++;
+                ret = 0;
+                goto fail;
+            }
+
+            if (bm[bit / 8] & (1 << (bit % 8))) {
+                ret = qcow2_check_metadata_overlap(bs,
+                        QCOW2_OL_CACHED & ~QCOW2_OL_REFCOUNT_BLOCK, rb_offset,
+                        s->cluster_size);
+                if (ret > 0) {
+                    fprintf(stderr, "ERROR refcount block %i overlaps with %s\n", i,
+                            overlap_strings[ffs(ret) - 1]);
+                    result->corruptions++;
+                } else if (ret < 0) {
+                    fprintf(stderr, "ERROR could not check refcount block %i overlap: "
+                            "%s\n", i, strerror(-ret));
+                    result->check_errors++;
+                    goto fail;
+                }
+            }
+        }
+
+        if (result->corruptions != initial_corruptions) {
+            ret = 0;
+            goto fail;
+        }
+
+        /* Now, enter the refcount blocks into the bitmap */
+        for (i = 0; i < s->refcount_table_size; i++) {
+            uint64_t rb_offset = s->refcount_table[i] & REFT_OFFSET_MASK;
+            uintptr_t bit = rb_offset >> bm_shift;
+            bm[bit / 8] |= 1 << (bit % 8);
+        }
+    }
+
+    ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_MAIN_HEADER |
+            QCOW2_OL_REFCOUNT_TABLE | QCOW2_OL_SNAPSHOT_TABLE,
+            s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+    if (ret > 0) {
+        fprintf(stderr, "ERROR L1 offset intersects %s\n",
+                overlap_strings[ffs(ret) - 1]);
+        result->corruptions++;
+        ret = 0;
+        goto fail;
+    } else if (ret < 0) {
+        fprintf(stderr, "ERROR could not check L1 overlap: %s\n",
+                strerror(-ret));
+        result->check_errors++;
+        goto fail;
+    }
+
+    if (s->l1_table) {
+        for (i = 0; i < s->l1_size; i++) {
+            uint64_t l2_offset = s->l1_table[i] & L1E_OFFSET_MASK;
+            uint64_t *l2;
+
+            if (!l2_offset) {
+                /* unallocated */
+                continue;
+            }
+
+            if (l2_offset & (s->cluster_size - 1)) {
+                fprintf(stderr, "ERROR L2 %i is not cluster aligned; L1 table "
+                        "entry corrupted\n", i);
+                result->corruptions++;
+                continue;
+            }
+
+            ret = qcow2_check_metadata_overlap(bs,
+                    QCOW2_OL_CACHED & ~QCOW2_OL_ACTIVE_L2, l2_offset,
+                    s->cluster_size);
+            if (ret > 0) {
+                fprintf(stderr, "ERROR L2 %i overlaps with %s\n", i,
+                        overlap_strings[ffs(ret) - 1]);
+                result->corruptions++;
+                continue;
+            } else if (ret < 0) {
+                fprintf(stderr, "ERROR could not check L2 %i overlap: %s\n", i,
+                        strerror(-ret));
+                result->check_errors++;
+                goto fail;
+            }
+
+            ret = l2_load(bs, l2_offset, &l2);
+            if ((ret < 0) || !l2) {
+                fprintf(stderr, "ERROR could not load L2 %i: %s\n", i,
+                        strerror(-ret));
+                result->check_errors++;
+                if (ret < 0) {
+                    goto fail;
+                } else {
+                    continue;
+                }
+            }
+
+            for (j = 0; j < s->l2_size; j++) {
+                uint64_t cl_offset, cl_size;
+                unsigned long long cl_index;
+                uint64_t l2_entry = be64_to_cpu(l2[j]);
+
+                if ((s->qcow_version >= 3) && (l2_entry & 1)) {
+                    /* cluster reads as all zeroes */
+                    continue;
+                }
+
+                if (l2_entry & (1ULL << 62)) { /* compressed */
+                    int xp1 = 62 - (s->cluster_bits - 9);
+                    cl_offset = l2_entry & ((1 << xp1) - 1);
+                    cl_size = ((l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK) >>
+                            xp1) * BDRV_SECTOR_SIZE;
+                } else { /* uncompressed */
+                    cl_offset = l2_entry & L2E_OFFSET_MASK;
+                    cl_size = s->cluster_size;
+                }
+
+                if (!cl_offset) {
+                    /* unallocated */
+                    continue;
+                }
+
+                cl_index = ((unsigned long long)i << s->l2_bits) | j;
+
+                if (!(l2_entry & (1ULL << 62)) &&
+                    cl_offset & (s->cluster_size - 1)) {
+                    fprintf(stderr, "ERROR data cluster 0x%llx is not cluster "
+                            "aligned in image\n", cl_index);
+                    result->corruptions++;
+                    continue;
+                }
+
+                if (!(bm[(cl_offset >> bm_shift) / 8] &
+                    (1 << ((cl_offset >> bm_shift) % 8)))) {
+                    continue;
+                }
+
+                ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_CACHED, cl_offset,
+                                                   cl_size);
+                if (ret > 0) {
+                    /* TODO: fix compressed clusters */
+                    if ((fix & BDRV_FIX_ERRORS) && !(l2_entry & (1ULL << 62))) {
+                        void *content = g_malloc(s->cluster_size);
+                        uint64_t sector = cl_index <<
+                                (s->cluster_bits - BDRV_SECTOR_BITS);
+
+                        fprintf(stderr, "Reallocating data cluster 0x%llx "
+                                "(currently overlapping with %s)\n", cl_index,
+                                overlap_strings[ffs(ret) - 1]);
+
+                        ret = bdrv_read(bs, sector, content,
+                                        s->cluster_sectors);
+                        if (ret < 0) {
+                            fprintf(stderr, "ERROR could not read cluster: "
+                                    "%s\n", strerror(-ret));
+                            result->corruptions++;
+                            g_free(content);
+                            qcow2_cache_put(bs, s->l2_table_cache,
+                                            (void **)&l2);
+                            goto fail;
+                        }
+
+                        /* unallocate cluster (but leaving the refcount for
+                         * now) */
+                        l2[j] = 0;
+
+                        /* FIXME: the image obviously is inconsistent, so this
+                         * write access may trigger new inconsistencies
+                         * (although the refcount check, which is supposed to be
+                         * performed before these checks, should take care of
+                         * this) */
+                        ret = bdrv_write(bs, sector, content,
+                                         s->cluster_sectors);
+                        if (ret < 0) {
+                            /* at least leave it as it was */
+                            l2[j] = cpu_to_be64(l2_entry);
+
+                            fprintf(stderr, "ERROR could not write cluster: "
+                                    "%s\n", strerror(-ret));
+                            result->corruptions++;
+                            g_free(content);
+                            qcow2_cache_put(bs, s->l2_table_cache,
+                                    (void **)&l2);
+                            goto fail;
+                        }
+
+                        g_free(content);
+                        result->corruptions_fixed++;
+
+                        /* Now decrement the refcount (although an invalid
+                         * refcount might have been the problem in the first
+                         * place, the refcount check (which is running before
+                         * this) will have fixed that) */
+                        qcow2_free_any_clusters(bs, l2_entry, 1,
+                                QCOW2_DISCARD_ALWAYS);
+                    } else {
+                        fprintf(stderr, "ERROR data cluster 0x%llx overlaps "
+                                "with %s\n", cl_index,
+                                overlap_strings[ffs(ret) - 1]);
+                        result->corruptions++;
+                    }
+                } else if (ret < 0) {
+                    fprintf(stderr, "ERROR could not check data cluster 0x%llx "
+                            "overlap: %s\n", cl_index, strerror(-ret));
+                    result->check_errors++;
+                    qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2);
+                    goto fail;
+                }
+            }
+
+            qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2);
+        }
+    }
+
+    ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_MAIN_HEADER |
+            QCOW2_OL_ACTIVE_L1 | QCOW2_OL_SNAPSHOT_TABLE,
+            s->refcount_table_offset,
+            s->refcount_table_size * sizeof(uint64_t));
+    if (ret > 0) {
+        fprintf(stderr, "ERROR refcount table overlaps with %s\n",
+                overlap_strings[ffs(ret) - 1]);
+        result->corruptions++;
+        ret = 0;
+        goto fail;
+    } else if (ret < 0) {
+        fprintf(stderr, "ERROR could not check refcount table overlap: %s\n",
+                strerror(-ret));
+        result->check_errors++;
+        goto fail;
+    }
+
+    if (!s->refcount_table) {
+        fprintf(stderr, "ERROR could not check refcount block overlap: "
+                "refcount table not loaded\n");
+        result->check_errors++;
+        ret = -EIO;
+        goto fail;
+    }
+
+    /* refcount blocks have already been checked (when creating the bitmap) */
+
+    ret = qcow2_check_metadata_overlap(bs, QCOW2_OL_MAIN_HEADER |
+            QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE, s->snapshots_offset,
+            s->snapshots_size);
+    if (ret > 0) {
+        fprintf(stderr, "ERROR snapshot table overlaps with %s\n",
+                overlap_strings[ffs(ret) - 1]);
+        result->corruptions++;
+        ret = 0;
+        goto fail;
+    } else if (ret < 0) {
+        fprintf(stderr, "ERROR could not check snapshot table overlap: %s\n",
+                strerror(-ret));
+        result->check_errors++;
+        goto fail;
+    }
+
+    /* We now know for sure that nothing overlaps with the inactive L1 tables -
+     * therefore, we can skip explicitly checking them. */
+
+    ret = 0;
+
+fail:
+    g_free(bm);
+
+    if (fix & BDRV_FIX_ERRORS) {
+        /* qemu-img check will double check if some corruptions could be fixed
+         * - without reopening the underlying image file. Since the refcount
+         * check doesn't use the L2 cache for some reason, it must be flushed
+         * to disk before the second check can commence (else it will report
+         * inexistent errors); and if we're already at it, the refcount cache
+         * may be flushed just as well */
+        qcow2_cache_flush(bs, s->l2_table_cache);
+        qcow2_cache_flush(bs, s->refcount_block_cache);
+    }
+
+    return ret;
+}
diff --git a/block/qcow2.c b/block/qcow2.c
index 95497c6..0b0f0ac 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -311,8 +311,19 @@  static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
         return ret;
     }
 
-    if (fix && result->check_errors == 0 && result->corruptions == 0) {
-        return qcow2_mark_clean(bs);
+    ret = qcow2_check_allocations(bs, result, fix);
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (result->check_errors == 0 && result->corruptions == 0) {
+        if (fix) {
+            ret = qcow2_mark_clean(bs);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        return qcow2_mark_consistent(bs);
     }
     return ret;
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index 8a55da0..3772ab6 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -410,6 +410,8 @@  int qcow2_update_snapshot_refcount(BlockDriverState *bs,
 
 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
                           BdrvCheckMode fix);
+int qcow2_check_allocations(BlockDriverState *bs, BdrvCheckResult *result,
+                            BdrvCheckMode fix);
 
 void qcow2_process_discards(BlockDriverState *bs, int ret);