diff mbox series

[v2,2/3] parallels: Add checking and repairing duplicate offsets in BAT

Message ID 20220805154752.799395-3-alexander.ivanov@virtuozzo.com
State New
Headers show
Series Check and repair duplicated clusters in parallels images | expand

Commit Message

Alexander Ivanov Aug. 5, 2022, 3:47 p.m. UTC
From: Alexander Ivanov <alexander.ivanov@virtuozzo.com>

There could be corruptions in the image file:
two guest memory areas refer to the same host cluster.

If a duplicate offset is found fix it by copying the content
of the referred cluster to a new allocated cluster and
replace one of the two referring entries by the new cluster offset.

Signed-off-by: Natalia Kuzmina <natalia.kuzmina@openvz.org>
Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
---
 block/parallels.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

Comments

Vladimir Sementsov-Ogievskiy Aug. 6, 2022, 8:45 p.m. UTC | #1
On 8/5/22 18:47, alexander.ivanov@virtuozzo.com wrote:
> From: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
> 
> There could be corruptions in the image file:
> two guest memory areas refer to the same host cluster.

Is that written in parallels spec (docs/interop/parallels.txt)?

Hmm yes: "- the value must be unique among all BAT entries".

> 
> If a duplicate offset is found fix it by copying the content
> of the referred cluster to a new allocated cluster and
> replace one of the two referring entries by the new cluster offset.
> 
> Signed-off-by: Natalia Kuzmina <natalia.kuzmina@openvz.org>
> Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
> ---
>   block/parallels.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 135 insertions(+)
> 
> diff --git a/block/parallels.c b/block/parallels.c
> index a0eb7ec3c3..73264b4bd1 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -64,6 +64,11 @@ static QEnumLookup prealloc_mode_lookup = {
>   #define PARALLELS_OPT_PREALLOC_MODE     "prealloc-mode"
>   #define PARALLELS_OPT_PREALLOC_SIZE     "prealloc-size"
>   
> +#define REVERSED_BAT_UNTOUCHED  0xFFFFFFFF
> +
> +#define HOST_CLUSTER_INDEX(s, off) \
> +    ((off - ((s->header->data_off) << BDRV_SECTOR_BITS)) / (s->cluster_size))

Let it be a static function, not a macro.

> +
>   static QemuOptsList parallels_runtime_opts = {
>       .name = "parallels",
>       .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head),
> @@ -559,6 +564,131 @@ static int check_leak(BlockDriverState *bs,
>       return 0;
>   }
>   
> +static int check_duplicate(BlockDriverState *bs,
> +                           BdrvCheckResult *res,
> +                           BdrvCheckMode fix)
> +{
> +    BDRVParallelsState *s = bs->opaque;
> +    QEMUIOVector qiov;
> +    int64_t off, high_off, size, sector_num;
> +    uint32_t i, idx_host;
> +    int ret = 0, n;
> +    g_autofree uint32_t *reversed_bat = NULL;
> +    g_autofree int64_t *cluster_buf = NULL;
> +    bool sync_and_truncate = false;
> +
> +    /*
> +     * Make a reversed BAT.
> +     * Each cluster in the host file could represent only one cluster
> +     * from the guest point of view. Reversed BAT provides mapping of that type.
> +     * Initially the table is filled with REVERSED_BAT_UNTOUCHED values.
> +     */
> +    reversed_bat = g_malloc(s->bat_size * sizeof(uint32_t));

Hmm. Why size of reversed_bat equal to bat_size? Couldn't host file size be larger than that?

Seems that we want calculate the highest offset first, and then allocate corresponding table.

Also, here we probably allocate too much memory. Better use g_try_malloc and clean error-out instead of crash.

> +    for (i = 0; i < s->bat_size; i++) {
> +        reversed_bat[i] = REVERSED_BAT_UNTOUCHED;
> +    }
> +
> +    cluster_buf = g_malloc(s->cluster_size);
> +    qemu_iovec_init(&qiov, 0);
> +    qemu_iovec_add(&qiov, cluster_buf, s->cluster_size);
> +
> +    high_off = 0;
> +    for (i = 0; i < s->bat_size; i++) {
> +        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
> +        if (off == 0) {
> +            continue;
> +        }
> +
> +        if (off > high_off) {
> +            high_off = off;
> +        }
> +
> +        idx_host = HOST_CLUSTER_INDEX(s, off);
> +        if (idx_host >= s->bat_size) {
> +            res->check_errors++;

As I understand, check_errors is mostly for IO errors during the check.

If it's incorrect for parallels format to have such cluster, we want res->corruptions++ here instead.

But is it really incorrect, what the spec say?

> +            goto out;
> +        }
> +
> +        if (reversed_bat[idx_host] != REVERSED_BAT_UNTOUCHED) {
> +            /*
> +             * We have alreade written a guest cluster index for this

already

> +             * host cluster. It means there is a duplicated offset in BAT.
> +             */
> +            fprintf(stderr,
> +                    "%s BAT offset in entry %u duplicates offset in entry %u\n",
> +                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
> +                    i, reversed_bat[idx_host]);
> +            res->corruptions++;
> +
> +            if (fix & BDRV_FIX_ERRORS) {
> +                /*
> +                 * Write zero to the current BAT entry, allocate a new cluster
> +                 * for the relevant guest offset and copy the host cluster
> +                 * to the new allocated cluster.
> +                 * In this way mwe will have two identical clusters and two
> +                 * BAT entries with the offsets of these clusters.
> +                 */
> +                s->bat_bitmap[i] = 0;

I don't understand that. So you just remove that guest cluster. Shouldn't we instead set it to new allocated off?

> +
> +                sector_num = bat2sect(s, reversed_bat[idx_host]);
> +                ret = bdrv_pread(bs->file, sector_num << BDRV_SECTOR_BITS,
> +                                 s->cluster_size, cluster_buf, 0);
> +                if (ret < 0) {
> +                    res->check_errors++;
> +                    goto out;
> +                }
> +
> +                sector_num = (i * s->cluster_size) >> BDRV_SECTOR_BITS;
> +                off = allocate_clusters(bs, sector_num, s->tracks, &n);

you probably want to update high_off here

> +                if (off < 0) {
> +                    res->check_errors++;
> +                    ret = off;
> +                    goto out;
> +                }
> +                off <<= BDRV_SECTOR_BITS;
> +
> +                ret = bdrv_co_pwritev(bs->file, off, s->cluster_size, &qiov, 0);
> +                if (ret < 0) {
> +                    res->check_errors++;
> +                    goto out;
> +                }
> +
> +                /* off is new and we should repair idx_host accordingly. */
> +                idx_host = HOST_CLUSTER_INDEX(s, off);
> +                res->corruptions_fixed++;
> +                sync_and_truncate = true;
> +            }
> +        }
> +        reversed_bat[idx_host] = i;
> +    }
> +
> +    if (sync_and_truncate) {
> +        ret = sync_header(bs, res);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +
> +        size = bdrv_getlength(bs->file->bs);
> +        if (size < 0) {
> +            res->check_errors++;
> +            ret = size;
> +            goto out;
> +        }
> +
> +        res->image_end_offset = high_off + s->cluster_size;
> +        if (size > res->image_end_offset) {
> +            ret = truncate_file(bs, res->image_end_offset);

that's already done in check_leak, why we need it again?

> +            if (ret < 0) {
> +                goto out;
> +            }
> +        }
> +    }
> +
> +out:
> +    qemu_iovec_destroy(&qiov);
> +    return ret;
> +}
> +
>   static void collect_statistics(BlockDriverState *bs, BdrvCheckResult *res)
>   {
>       BDRVParallelsState *s = bs->opaque;
> @@ -598,6 +728,11 @@ static int coroutine_fn parallels_co_check(BlockDriverState *bs,
>           goto out;
>       }
>   
> +    ret = check_duplicate(bs, res, fix);
> +    if (ret != 0) {
> +        goto out;
> +    }
> +
>       collect_statistics(bs, res);
>   
>   out:
Denis V. Lunev Aug. 6, 2022, 9:07 p.m. UTC | #2
On 06.08.2022 22:45, Vladimir Sementsov-Ogievskiy wrote:
> On 8/5/22 18:47, alexander.ivanov@virtuozzo.com wrote:
>> From: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
>>
>> There could be corruptions in the image file:
>> two guest memory areas refer to the same host cluster.
>
> Is that written in parallels spec (docs/interop/parallels.txt)?
>
> Hmm yes: "- the value must be unique among all BAT entries".
>
>>
>> If a duplicate offset is found fix it by copying the content
>> of the referred cluster to a new allocated cluster and
>> replace one of the two referring entries by the new cluster offset.
>>
>> Signed-off-by: Natalia Kuzmina <natalia.kuzmina@openvz.org>
>> Signed-off-by: Alexander Ivanov <alexander.ivanov@virtuozzo.com>
>> ---
>>   block/parallels.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 135 insertions(+)
>>
>> diff --git a/block/parallels.c b/block/parallels.c
>> index a0eb7ec3c3..73264b4bd1 100644
>> --- a/block/parallels.c
>> +++ b/block/parallels.c
>> @@ -64,6 +64,11 @@ static QEnumLookup prealloc_mode_lookup = {
>>   #define PARALLELS_OPT_PREALLOC_MODE     "prealloc-mode"
>>   #define PARALLELS_OPT_PREALLOC_SIZE     "prealloc-size"
>>   +#define REVERSED_BAT_UNTOUCHED  0xFFFFFFFF
>> +
>> +#define HOST_CLUSTER_INDEX(s, off) \
>> +    ((off - ((s->header->data_off) << BDRV_SECTOR_BITS)) / 
>> (s->cluster_size))
>
> Let it be a static function, not a macro.
>
>> +
>>   static QemuOptsList parallels_runtime_opts = {
>>       .name = "parallels",
>>       .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head),
>> @@ -559,6 +564,131 @@ static int check_leak(BlockDriverState *bs,
>>       return 0;
>>   }
>>   +static int check_duplicate(BlockDriverState *bs,
>> +                           BdrvCheckResult *res,
>> +                           BdrvCheckMode fix)
>> +{
>> +    BDRVParallelsState *s = bs->opaque;
>> +    QEMUIOVector qiov;
>> +    int64_t off, high_off, size, sector_num;
>> +    uint32_t i, idx_host;
>> +    int ret = 0, n;
>> +    g_autofree uint32_t *reversed_bat = NULL;
>> +    g_autofree int64_t *cluster_buf = NULL;
>> +    bool sync_and_truncate = false;
>> +
>> +    /*
>> +     * Make a reversed BAT.
>> +     * Each cluster in the host file could represent only one cluster
>> +     * from the guest point of view. Reversed BAT provides mapping 
>> of that type.
>> +     * Initially the table is filled with REVERSED_BAT_UNTOUCHED 
>> values.
>> +     */
>> +    reversed_bat = g_malloc(s->bat_size * sizeof(uint32_t));
>
> Hmm. Why size of reversed_bat equal to bat_size? Couldn't host file 
> size be larger than that?
>
> Seems that we want calculate the highest offset first, and then 
> allocate corresponding table.
>
> Also, here we probably allocate too much memory. Better use 
> g_try_malloc and clean error-out instead of crash.
>

This is incorrect. reversed_bat has __different__ size than BAT.
Technically we should take into account real file length and
use it as a size. Even if all blocks are allocated, the size of
the file is __greater__ than size of the payload by BAT + header
sizes.

But this is even trickier. The length could grow once we
copy duplicated clusters.

Also we are fucked up a bit with this way a little bit. We can have
first cluster just after the BAT (not aligned to cluster) or
with a padding, i.e. aligned to cluster size. This means that
data_off could be zero and non-zero.

I have missed this 3 times already :(

>> +    for (i = 0; i < s->bat_size; i++) {
>> +        reversed_bat[i] = REVERSED_BAT_UNTOUCHED;
>> +    }
>> +
>> +    cluster_buf = g_malloc(s->cluster_size);
>> +    qemu_iovec_init(&qiov, 0);
>> +    qemu_iovec_add(&qiov, cluster_buf, s->cluster_size);
>> +
>> +    high_off = 0;
>> +    for (i = 0; i < s->bat_size; i++) {
>> +        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
>> +        if (off == 0) {
>> +            continue;
>> +        }
>> +
>> +        if (off > high_off) {
>> +            high_off = off;
>> +        }
>> +
>> +        idx_host = HOST_CLUSTER_INDEX(s, off);
>> +        if (idx_host >= s->bat_size) {
>> +            res->check_errors++;
>
> As I understand, check_errors is mostly for IO errors during the check.
>
> If it's incorrect for parallels format to have such cluster, we want 
> res->corruptions++ here instead.
>
> But is it really incorrect, what the spec say?
>
Common sense says that if BAT points outside of the real
host file that cluster is really not existing and should be
read as zeroes. This is what and how we are fixing here.


>> +            goto out;
>> +        }
>> +
>> +        if (reversed_bat[idx_host] != REVERSED_BAT_UNTOUCHED) {
>> +            /*
>> +             * We have alreade written a guest cluster index for this
>
> already
>
>> +             * host cluster. It means there is a duplicated offset 
>> in BAT.
>> +             */
>> +            fprintf(stderr,
>> +                    "%s BAT offset in entry %u duplicates offset in 
>> entry %u\n",
>> +                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
>> +                    i, reversed_bat[idx_host]);
>> +            res->corruptions++;
>> +
>> +            if (fix & BDRV_FIX_ERRORS) {
>> +                /*
>> +                 * Write zero to the current BAT entry, allocate a 
>> new cluster
>> +                 * for the relevant guest offset and copy the host 
>> cluster
>> +                 * to the new allocated cluster.
>> +                 * In this way mwe will have two identical clusters 
>> and two
>> +                 * BAT entries with the offsets of these clusters.
>> +                 */
>> +                s->bat_bitmap[i] = 0;
>
> I don't understand that. So you just remove that guest cluster. 
> Shouldn't we instead set it to new allocated off?
>
we will write it again below and new offset will be allocated
properly within alloc_cluster()

>> +
>> +                sector_num = bat2sect(s, reversed_bat[idx_host]);
>> +                ret = bdrv_pread(bs->file, sector_num << 
>> BDRV_SECTOR_BITS,
>> +                                 s->cluster_size, cluster_buf, 0);
>> +                if (ret < 0) {
>> +                    res->check_errors++;
>> +                    goto out;
>> +                }
>> +
>> +                sector_num = (i * s->cluster_size) >> BDRV_SECTOR_BITS;
>> +                off = allocate_clusters(bs, sector_num, s->tracks, &n);
>
> you probably want to update high_off here
>
statistics should be calculated after all fixes. That should be more 
correct.


>> +                if (off < 0) {
>> +                    res->check_errors++;
>> +                    ret = off;
>> +                    goto out;
>> +                }
>> +                off <<= BDRV_SECTOR_BITS;
>> +
>> +                ret = bdrv_co_pwritev(bs->file, off, 
>> s->cluster_size, &qiov, 0);
>> +                if (ret < 0) {
>> +                    res->check_errors++;
>> +                    goto out;
>> +                }
>> +
>> +                /* off is new and we should repair idx_host 
>> accordingly. */
>> +                idx_host = HOST_CLUSTER_INDEX(s, off);
>> +                res->corruptions_fixed++;
>> +                sync_and_truncate = true;
>> +            }
>> +        }
>> +        reversed_bat[idx_host] = i;
>> +    }
>> +
>> +    if (sync_and_truncate) {
>> +        ret = sync_header(bs, res);
>> +        if (ret < 0) {
>> +            goto out;
>> +        }
>> +
>> +        size = bdrv_getlength(bs->file->bs);
>> +        if (size < 0) {
>> +            res->check_errors++;
>> +            ret = size;
>> +            goto out;
>> +        }
>> +
>> +        res->image_end_offset = high_off + s->cluster_size;
>> +        if (size > res->image_end_offset) {
>> +            ret = truncate_file(bs, res->image_end_offset);
>
> that's already done in check_leak, why we need it again?
>
>> +            if (ret < 0) {
>> +                goto out;
>> +            }
>> +        }
>> +    }
>> +
>> +out:
>> +    qemu_iovec_destroy(&qiov);
>> +    return ret;
>> +}
>> +
>>   static void collect_statistics(BlockDriverState *bs, 
>> BdrvCheckResult *res)
>>   {
>>       BDRVParallelsState *s = bs->opaque;
>> @@ -598,6 +728,11 @@ static int coroutine_fn 
>> parallels_co_check(BlockDriverState *bs,
>>           goto out;
>>       }
>>   +    ret = check_duplicate(bs, res, fix);
>> +    if (ret != 0) {
>> +        goto out;
>> +    }
>> +
>>       collect_statistics(bs, res);
>>     out:
>
>
diff mbox series

Patch

diff --git a/block/parallels.c b/block/parallels.c
index a0eb7ec3c3..73264b4bd1 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -64,6 +64,11 @@  static QEnumLookup prealloc_mode_lookup = {
 #define PARALLELS_OPT_PREALLOC_MODE     "prealloc-mode"
 #define PARALLELS_OPT_PREALLOC_SIZE     "prealloc-size"
 
+#define REVERSED_BAT_UNTOUCHED  0xFFFFFFFF
+
+#define HOST_CLUSTER_INDEX(s, off) \
+    ((off - ((s->header->data_off) << BDRV_SECTOR_BITS)) / (s->cluster_size))
+
 static QemuOptsList parallels_runtime_opts = {
     .name = "parallels",
     .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head),
@@ -559,6 +564,131 @@  static int check_leak(BlockDriverState *bs,
     return 0;
 }
 
+static int check_duplicate(BlockDriverState *bs,
+                           BdrvCheckResult *res,
+                           BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    QEMUIOVector qiov;
+    int64_t off, high_off, size, sector_num;
+    uint32_t i, idx_host;
+    int ret = 0, n;
+    g_autofree uint32_t *reversed_bat = NULL;
+    g_autofree int64_t *cluster_buf = NULL;
+    bool sync_and_truncate = false;
+
+    /*
+     * Make a reversed BAT.
+     * Each cluster in the host file could represent only one cluster
+     * from the guest point of view. Reversed BAT provides mapping of that type.
+     * Initially the table is filled with REVERSED_BAT_UNTOUCHED values.
+     */
+    reversed_bat = g_malloc(s->bat_size * sizeof(uint32_t));
+    for (i = 0; i < s->bat_size; i++) {
+        reversed_bat[i] = REVERSED_BAT_UNTOUCHED;
+    }
+
+    cluster_buf = g_malloc(s->cluster_size);
+    qemu_iovec_init(&qiov, 0);
+    qemu_iovec_add(&qiov, cluster_buf, s->cluster_size);
+
+    high_off = 0;
+    for (i = 0; i < s->bat_size; i++) {
+        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+        if (off == 0) {
+            continue;
+        }
+
+        if (off > high_off) {
+            high_off = off;
+        }
+
+        idx_host = HOST_CLUSTER_INDEX(s, off);
+        if (idx_host >= s->bat_size) {
+            res->check_errors++;
+            goto out;
+        }
+
+        if (reversed_bat[idx_host] != REVERSED_BAT_UNTOUCHED) {
+            /*
+             * We have alreade written a guest cluster index for this
+             * host cluster. It means there is a duplicated offset in BAT.
+             */
+            fprintf(stderr,
+                    "%s BAT offset in entry %u duplicates offset in entry %u\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
+                    i, reversed_bat[idx_host]);
+            res->corruptions++;
+
+            if (fix & BDRV_FIX_ERRORS) {
+                /*
+                 * Write zero to the current BAT entry, allocate a new cluster
+                 * for the relevant guest offset and copy the host cluster
+                 * to the new allocated cluster.
+                 * In this way mwe will have two identical clusters and two
+                 * BAT entries with the offsets of these clusters.
+                 */
+                s->bat_bitmap[i] = 0;
+
+                sector_num = bat2sect(s, reversed_bat[idx_host]);
+                ret = bdrv_pread(bs->file, sector_num << BDRV_SECTOR_BITS,
+                                 s->cluster_size, cluster_buf, 0);
+                if (ret < 0) {
+                    res->check_errors++;
+                    goto out;
+                }
+
+                sector_num = (i * s->cluster_size) >> BDRV_SECTOR_BITS;
+                off = allocate_clusters(bs, sector_num, s->tracks, &n);
+                if (off < 0) {
+                    res->check_errors++;
+                    ret = off;
+                    goto out;
+                }
+                off <<= BDRV_SECTOR_BITS;
+
+                ret = bdrv_co_pwritev(bs->file, off, s->cluster_size, &qiov, 0);
+                if (ret < 0) {
+                    res->check_errors++;
+                    goto out;
+                }
+
+                /* off is new and we should repair idx_host accordingly. */
+                idx_host = HOST_CLUSTER_INDEX(s, off);
+                res->corruptions_fixed++;
+                sync_and_truncate = true;
+            }
+        }
+        reversed_bat[idx_host] = i;
+    }
+
+    if (sync_and_truncate) {
+        ret = sync_header(bs, res);
+        if (ret < 0) {
+            goto out;
+        }
+
+        size = bdrv_getlength(bs->file->bs);
+        if (size < 0) {
+            res->check_errors++;
+            ret = size;
+            goto out;
+        }
+
+        res->image_end_offset = high_off + s->cluster_size;
+        if (size > res->image_end_offset) {
+            ret = truncate_file(bs, res->image_end_offset);
+            if (ret < 0) {
+                goto out;
+            }
+        }
+    }
+
+out:
+    qemu_iovec_destroy(&qiov);
+    return ret;
+}
+
 static void collect_statistics(BlockDriverState *bs, BdrvCheckResult *res)
 {
     BDRVParallelsState *s = bs->opaque;
@@ -598,6 +728,11 @@  static int coroutine_fn parallels_co_check(BlockDriverState *bs,
         goto out;
     }
 
+    ret = check_duplicate(bs, res, fix);
+    if (ret != 0) {
+        goto out;
+    }
+
     collect_statistics(bs, res);
 
 out: