diff mbox

[08/27] block/parallels: _co_writev callback for Parallels format

Message ID 1426069701-1405-9-git-send-email-den@openvz.org
State New
Headers show

Commit Message

Denis V. Lunev March 11, 2015, 10:28 a.m. UTC
Support write on Parallels images. The code is almost the same as one
in the previous patch implemented scatter-gather IO for read.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Roman Kagan <rkagan@parallels.com>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/parallels.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 75 insertions(+), 2 deletions(-)

Comments

Denis V. Lunev April 22, 2015, 12:44 p.m. UTC | #1
On 11/03/15 13:28, Denis V. Lunev wrote:
> Support write on Parallels images. The code is almost the same as one
> in the previous patch implemented scatter-gather IO for read.
>
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> Reviewed-by: Roman Kagan <rkagan@parallels.com>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>   block/parallels.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 75 insertions(+), 2 deletions(-)
>
> diff --git a/block/parallels.c b/block/parallels.c
> index 306f2e3..61d7da7 100644
> --- a/block/parallels.c
> +++ b/block/parallels.c
> @@ -81,8 +81,6 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
>       ParallelsHeader ph;
>       int ret;
>   
> -    bs->read_only = 1; // no write support yet
> -
>       ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
>       if (ret < 0) {
>           goto fail;
> @@ -166,6 +164,37 @@ static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num,
>       return MIN(nb_sectors, ret);
>   }
>   
> +static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
> +{
> +    BDRVParallelsState *s = bs->opaque;
> +    uint32_t idx, offset, tmp;
> +    int64_t pos;
> +    int ret;
> +
> +    idx = sector_num / s->tracks;
> +    offset = sector_num % s->tracks;
> +
> +    if (idx >= s->catalog_size) {
> +        return -EINVAL;
> +    }
> +    if (s->catalog_bitmap[idx] != 0) {
> +        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> +    }
> +
> +    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
> +    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
> +    s->catalog_bitmap[idx] = pos / s->off_multiplier;
> +
> +    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
> +
> +    ret = bdrv_pwrite_sync(bs->file,
> +            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
> +    if (ret < 0) {
> +        return ret;
> +    }
> +    return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> +}
> +
>   static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
>           int64_t sector_num, int nb_sectors, int *pnum)
>   {
> @@ -186,6 +215,49 @@ static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
>           BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>   }
>   
> +static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVParallelsState *s = bs->opaque;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    while (nb_sectors > 0) {
> +        int64_t position = allocate_cluster(bs, sector_num);
> +        int n = cluster_remainder(s, sector_num, nb_sectors);
> +        int nbytes = n << BDRV_SECTOR_BITS;
> +
> +        if (position < 0) {
> +            ret = (int)position;
> +            break;
> +        }
> +
> +        qemu_iovec_reset(&hd_qiov);
> +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);
> +
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = bdrv_co_writev(bs->file, position, n, &hd_qiov);
> +        qemu_co_mutex_lock(&s->lock);
> +
> +        if (ret < 0) {
> +            goto fail;
same problem with unlock as in patch 6


> +        }
> +
> +        nb_sectors -= n;
> +        sector_num += n;
> +        bytes_done += nbytes;
> +    }
> +    qemu_co_mutex_unlock(&s->lock);
> +
> +fail:
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
> +
>   static coroutine_fn int parallels_co_readv(BlockDriverState *bs,
>           int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
>   {
> @@ -242,6 +314,7 @@ static BlockDriver bdrv_parallels = {
>       .bdrv_close		= parallels_close,
>       .bdrv_co_get_block_status = parallels_co_get_block_status,
>       .bdrv_co_readv  = parallels_co_readv,
> +    .bdrv_co_writev = parallels_co_writev,
>   };
>   
>   static void bdrv_parallels_init(void)
Stefan Hajnoczi April 22, 2015, 1 p.m. UTC | #2
On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
> +static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> +{
> +    BDRVParallelsState *s = bs->opaque;
> +    uint64_t bytes_done = 0;
> +    QEMUIOVector hd_qiov;
> +    int ret = 0;
> +
> +    qemu_iovec_init(&hd_qiov, qiov->niov);
> +
> +    qemu_co_mutex_lock(&s->lock);
> +    while (nb_sectors > 0) {
> +        int64_t position = allocate_cluster(bs, sector_num);
> +        int n = cluster_remainder(s, sector_num, nb_sectors);
> +        int nbytes = n << BDRV_SECTOR_BITS;
> +
> +        if (position < 0) {
> +            ret = (int)position;
> +            break;
> +        }
> +
> +        qemu_iovec_reset(&hd_qiov);
> +        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);
> +
> +        qemu_co_mutex_unlock(&s->lock);
> +        ret = bdrv_co_writev(bs->file, position, n, &hd_qiov);
> +        qemu_co_mutex_lock(&s->lock);
> +
> +        if (ret < 0) {
> +            goto fail;

Missing unlock if goto is taken.  I'd suggest dropping the 'fail' label
and using break.

> +        }
> +
> +        nb_sectors -= n;
> +        sector_num += n;
> +        bytes_done += nbytes;
> +    }
> +    qemu_co_mutex_unlock(&s->lock);
> +
> +fail:
> +    qemu_iovec_destroy(&hd_qiov);
> +    return ret;
> +}
Stefan Hajnoczi April 22, 2015, 1:08 p.m. UTC | #3
On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
> +static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
> +{
> +    BDRVParallelsState *s = bs->opaque;
> +    uint32_t idx, offset, tmp;
> +    int64_t pos;
> +    int ret;
> +
> +    idx = sector_num / s->tracks;
> +    offset = sector_num % s->tracks;
> +
> +    if (idx >= s->catalog_size) {
> +        return -EINVAL;
> +    }
> +    if (s->catalog_bitmap[idx] != 0) {
> +        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> +    }
> +
> +    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
> +    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
> +    s->catalog_bitmap[idx] = pos / s->off_multiplier;
> +
> +    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
> +
> +    ret = bdrv_pwrite_sync(bs->file,
> +            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));

What is the purpose of the sync?

> +    if (ret < 0) {
> +        return ret;
> +    }
> +    return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> +}

This function is missing error handling.  If the catalog bitmap update
cannot be written to file then our in-memory copy should also be
reverted back to 0 (unallocated).
Denis V. Lunev April 22, 2015, 1:16 p.m. UTC | #4
On 22/04/15 16:08, Stefan Hajnoczi wrote:
> On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
>> +static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
>> +{
>> +    BDRVParallelsState *s = bs->opaque;
>> +    uint32_t idx, offset, tmp;
>> +    int64_t pos;
>> +    int ret;
>> +
>> +    idx = sector_num / s->tracks;
>> +    offset = sector_num % s->tracks;
>> +
>> +    if (idx >= s->catalog_size) {
>> +        return -EINVAL;
>> +    }
>> +    if (s->catalog_bitmap[idx] != 0) {
>> +        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
>> +    }
>> +
>> +    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
>> +    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
>> +    s->catalog_bitmap[idx] = pos / s->off_multiplier;
>> +
>> +    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
>> +
>> +    ret = bdrv_pwrite_sync(bs->file,
>> +            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
> What is the purpose of the sync?
This is necessary to preserve image consistency on crash from
my point of view. There is no check consistency at the moment.
The sync will be removed later when proper crash detection
code will be added (patches 19, 20, 21)

On the other hand this _sync  was borrowed from qcow.c:get_cluster_offset
which was used as a base of this work.
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +    return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
>> +}
> This function is missing error handling.  If the catalog bitmap update
> cannot be written to file then our in-memory copy should also be
> reverted back to 0 (unallocated).
ok
Stefan Hajnoczi April 23, 2015, 9:20 a.m. UTC | #5
On Wed, Apr 22, 2015 at 04:16:38PM +0300, Denis V. Lunev wrote:
> On 22/04/15 16:08, Stefan Hajnoczi wrote:
> >On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
> >>+static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
> >>+{
> >>+    BDRVParallelsState *s = bs->opaque;
> >>+    uint32_t idx, offset, tmp;
> >>+    int64_t pos;
> >>+    int ret;
> >>+
> >>+    idx = sector_num / s->tracks;
> >>+    offset = sector_num % s->tracks;
> >>+
> >>+    if (idx >= s->catalog_size) {
> >>+        return -EINVAL;
> >>+    }
> >>+    if (s->catalog_bitmap[idx] != 0) {
> >>+        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> >>+    }
> >>+
> >>+    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
> >>+    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
> >>+    s->catalog_bitmap[idx] = pos / s->off_multiplier;
> >>+
> >>+    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
> >>+
> >>+    ret = bdrv_pwrite_sync(bs->file,
> >>+            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
> >What is the purpose of the sync?
> This is necessary to preserve image consistency on crash from
> my point of view. There is no check consistency at the moment.
> The sync will be removed later when proper crash detection
> code will be added (patches 19, 20, 21)

Let's look at possible orderings in case of failure:

A. BAT update
B. Data write

This sync enforces A, B ordering.  If we can see B, then A must also
have happened thanks to the sync.

But A, B ordering is too conservative.  Imagine B, A ordering and the
failure where we crash before A.  It means we wrote the data but never
linked it into the BAT.

What happens in that case?  We've leaked a cluster in the underlying
image file but it doesn't corrupt the visible disk from the guest
point-of-view.

Because your implementation uses truncate to extend the file size before
A, even the A, B failure case results in a leaked cluster.  So the B, A
case is not worse in any way.

Why do other image formats sync cluster allocation updates?  Because
they support backing files and in that case an A, B ordering results in
data corruption so they enforce B, A ordering (the opposite of what
you're trying to do!).

The reason why A, B ordering results in data corruption when backing
files are in use is because the guest's write request might touch only a
subset of the cluster (a couple of sectors out of the whole cluster).
So the guest needs to copy the remaining sectors from the backing file.
If there is a dangling BAT entry like in the A, B failure case, then the
guest will see a zeroed cluster instead of the contents of the backing
file.  This is a data corruption, but only if a backing file is being
used!

So the sync is not necessary, both A, B and B, A ordering work for
block/parallels.c.

Stefan
Kevin Wolf April 23, 2015, 9:32 a.m. UTC | #6
Am 23.04.2015 um 11:20 hat Stefan Hajnoczi geschrieben:
> On Wed, Apr 22, 2015 at 04:16:38PM +0300, Denis V. Lunev wrote:
> > On 22/04/15 16:08, Stefan Hajnoczi wrote:
> > >On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
> > >>+static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
> > >>+{
> > >>+    BDRVParallelsState *s = bs->opaque;
> > >>+    uint32_t idx, offset, tmp;
> > >>+    int64_t pos;
> > >>+    int ret;
> > >>+
> > >>+    idx = sector_num / s->tracks;
> > >>+    offset = sector_num % s->tracks;
> > >>+
> > >>+    if (idx >= s->catalog_size) {
> > >>+        return -EINVAL;
> > >>+    }
> > >>+    if (s->catalog_bitmap[idx] != 0) {
> > >>+        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> > >>+    }
> > >>+
> > >>+    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
> > >>+    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
> > >>+    s->catalog_bitmap[idx] = pos / s->off_multiplier;
> > >>+
> > >>+    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
> > >>+
> > >>+    ret = bdrv_pwrite_sync(bs->file,
> > >>+            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
> > >What is the purpose of the sync?
> > This is necessary to preserve image consistency on crash from
> > my point of view. There is no check consistency at the moment.
> > The sync will be removed later when proper crash detection
> > code will be added (patches 19, 20, 21)
> 
> Let's look at possible orderings in case of failure:
> 
> A. BAT update
> B. Data write
> 
> This sync enforces A, B ordering.  If we can see B, then A must also
> have happened thanks to the sync.
> 
> But A, B ordering is too conservative.  Imagine B, A ordering and the
> failure where we crash before A.  It means we wrote the data but never
> linked it into the BAT.
> 
> What happens in that case?  We've leaked a cluster in the underlying
> image file but it doesn't corrupt the visible disk from the guest
> point-of-view.
> 
> Because your implementation uses truncate to extend the file size before
> A, even the A, B failure case results in a leaked cluster.  So the B, A
> case is not worse in any way.
> 
> Why do other image formats sync cluster allocation updates?  Because
> they support backing files and in that case an A, B ordering results in
> data corruption so they enforce B, A ordering (the opposite of what
> you're trying to do!).
> 
> The reason why A, B ordering results in data corruption when backing
> files are in use is because the guest's write request might touch only a
> subset of the cluster (a couple of sectors out of the whole cluster).
> So the guest needs to copy the remaining sectors from the backing file.
> If there is a dangling BAT entry like in the A, B failure case, then the
> guest will see a zeroed cluster instead of the contents of the backing
> file.  This is a data corruption, but only if a backing file is being
> used!
> 
> So the sync is not necessary, both A, B and B, A ordering work for
> block/parallels.c.

Actually, I suspect this means that the parallels driver is restricted
to protocols with bdrv_has_zero_init() == true, otherwise zeros can turn
into random data (which means that it can't work e.g. directly on host
block devices).

Do we enforce this?

Kevin
Denis V. Lunev April 23, 2015, 9:36 a.m. UTC | #7
On 23/04/15 12:20, Stefan Hajnoczi wrote:
> On Wed, Apr 22, 2015 at 04:16:38PM +0300, Denis V. Lunev wrote:
>> On 22/04/15 16:08, Stefan Hajnoczi wrote:
>>> On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
>>>> +static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
>>>> +{
>>>> +    BDRVParallelsState *s = bs->opaque;
>>>> +    uint32_t idx, offset, tmp;
>>>> +    int64_t pos;
>>>> +    int ret;
>>>> +
>>>> +    idx = sector_num / s->tracks;
>>>> +    offset = sector_num % s->tracks;
>>>> +
>>>> +    if (idx >= s->catalog_size) {
>>>> +        return -EINVAL;
>>>> +    }
>>>> +    if (s->catalog_bitmap[idx] != 0) {
>>>> +        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
>>>> +    }
>>>> +
>>>> +    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
>>>> +    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
>>>> +    s->catalog_bitmap[idx] = pos / s->off_multiplier;
>>>> +
>>>> +    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
>>>> +
>>>> +    ret = bdrv_pwrite_sync(bs->file,
>>>> +            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
>>> What is the purpose of the sync?
>> This is necessary to preserve image consistency on crash from
>> my point of view. There is no check consistency at the moment.
>> The sync will be removed later when proper crash detection
>> code will be added (patches 19, 20, 21)
> Let's look at possible orderings in case of failure:
>
> A. BAT update
> B. Data write
>
> This sync enforces A, B ordering.  If we can see B, then A must also
> have happened thanks to the sync.
>
> But A, B ordering is too conservative.  Imagine B, A ordering and the
> failure where we crash before A.  It means we wrote the data but never
> linked it into the BAT.
>
> What happens in that case?  We've leaked a cluster in the underlying
> image file but it doesn't corrupt the visible disk from the guest
> point-of-view.
>
> Because your implementation uses truncate to extend the file size before
> A, even the A, B failure case results in a leaked cluster.  So the B, A
> case is not worse in any way.
>
> Why do other image formats sync cluster allocation updates?  Because
> they support backing files and in that case an A, B ordering results in
> data corruption so they enforce B, A ordering (the opposite of what
> you're trying to do!).
>
> The reason why A, B ordering results in data corruption when backing
> files are in use is because the guest's write request might touch only a
> subset of the cluster (a couple of sectors out of the whole cluster).
> So the guest needs to copy the remaining sectors from the backing file.
> If there is a dangling BAT entry like in the A, B failure case, then the
> guest will see a zeroed cluster instead of the contents of the backing
> file.  This is a data corruption, but only if a backing file is being
> used!
>
> So the sync is not necessary, both A, B and B, A ordering work for
> block/parallels.c.
>
> Stefan
this is very interesting speculation. I have never considered the
problem from this point of view. Thank you very much for this!
Denis V. Lunev April 23, 2015, 9:47 a.m. UTC | #8
On 23/04/15 12:32, Kevin Wolf wrote:
> Am 23.04.2015 um 11:20 hat Stefan Hajnoczi geschrieben:
>> On Wed, Apr 22, 2015 at 04:16:38PM +0300, Denis V. Lunev wrote:
>>> On 22/04/15 16:08, Stefan Hajnoczi wrote:
>>>> On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
>>>>> +static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
>>>>> +{
>>>>> +    BDRVParallelsState *s = bs->opaque;
>>>>> +    uint32_t idx, offset, tmp;
>>>>> +    int64_t pos;
>>>>> +    int ret;
>>>>> +
>>>>> +    idx = sector_num / s->tracks;
>>>>> +    offset = sector_num % s->tracks;
>>>>> +
>>>>> +    if (idx >= s->catalog_size) {
>>>>> +        return -EINVAL;
>>>>> +    }
>>>>> +    if (s->catalog_bitmap[idx] != 0) {
>>>>> +        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
>>>>> +    }
>>>>> +
>>>>> +    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
>>>>> +    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
>>>>> +    s->catalog_bitmap[idx] = pos / s->off_multiplier;
>>>>> +
>>>>> +    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
>>>>> +
>>>>> +    ret = bdrv_pwrite_sync(bs->file,
>>>>> +            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
>>>> What is the purpose of the sync?
>>> This is necessary to preserve image consistency on crash from
>>> my point of view. There is no check consistency at the moment.
>>> The sync will be removed later when proper crash detection
>>> code will be added (patches 19, 20, 21)
>> Let's look at possible orderings in case of failure:
>>
>> A. BAT update
>> B. Data write
>>
>> This sync enforces A, B ordering.  If we can see B, then A must also
>> have happened thanks to the sync.
>>
>> But A, B ordering is too conservative.  Imagine B, A ordering and the
>> failure where we crash before A.  It means we wrote the data but never
>> linked it into the BAT.
>>
>> What happens in that case?  We've leaked a cluster in the underlying
>> image file but it doesn't corrupt the visible disk from the guest
>> point-of-view.
>>
>> Because your implementation uses truncate to extend the file size before
>> A, even the A, B failure case results in a leaked cluster.  So the B, A
>> case is not worse in any way.
>>
>> Why do other image formats sync cluster allocation updates?  Because
>> they support backing files and in that case an A, B ordering results in
>> data corruption so they enforce B, A ordering (the opposite of what
>> you're trying to do!).
>>
>> The reason why A, B ordering results in data corruption when backing
>> files are in use is because the guest's write request might touch only a
>> subset of the cluster (a couple of sectors out of the whole cluster).
>> So the guest needs to copy the remaining sectors from the backing file.
>> If there is a dangling BAT entry like in the A, B failure case, then the
>> guest will see a zeroed cluster instead of the contents of the backing
>> file.  This is a data corruption, but only if a backing file is being
>> used!
>>
>> So the sync is not necessary, both A, B and B, A ordering work for
>> block/parallels.c.
> Actually, I suspect this means that the parallels driver is restricted
> to protocols with bdrv_has_zero_init() == true, otherwise zeros can turn
> into random data (which means that it can't work e.g. directly on host
> block devices).
>
> Do we enforce this?
>
> Kevin
this is fixed in the patch 26 when the code is replaced with

+    if (s->data_end + s->tracks > pos) {
+        int ret;
+        if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
+            ret = bdrv_write_zeroes(bs->file, s->data_end,
+                                    s->prealloc_size, 0);
+        } else {
+            ret = bdrv_truncate(bs->file,
+                    (s->data_end + s->prealloc_size) << BDRV_SECTOR_BITS);
+        }
+        if (ret < 0) {
+            return ret;
+        }
+    }

on a default path, but you are correct. Some checking is
necessary to be on a safe side.

Den
Kevin Wolf April 23, 2015, 10:09 a.m. UTC | #9
Am 23.04.2015 um 11:47 hat Denis V. Lunev geschrieben:
> On 23/04/15 12:32, Kevin Wolf wrote:
> >Am 23.04.2015 um 11:20 hat Stefan Hajnoczi geschrieben:
> >>On Wed, Apr 22, 2015 at 04:16:38PM +0300, Denis V. Lunev wrote:
> >>>On 22/04/15 16:08, Stefan Hajnoczi wrote:
> >>>>On Wed, Mar 11, 2015 at 01:28:02PM +0300, Denis V. Lunev wrote:
> >>>>>+static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
> >>>>>+{
> >>>>>+    BDRVParallelsState *s = bs->opaque;
> >>>>>+    uint32_t idx, offset, tmp;
> >>>>>+    int64_t pos;
> >>>>>+    int ret;
> >>>>>+
> >>>>>+    idx = sector_num / s->tracks;
> >>>>>+    offset = sector_num % s->tracks;
> >>>>>+
> >>>>>+    if (idx >= s->catalog_size) {
> >>>>>+        return -EINVAL;
> >>>>>+    }
> >>>>>+    if (s->catalog_bitmap[idx] != 0) {
> >>>>>+        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
> >>>>>+    }
> >>>>>+
> >>>>>+    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
> >>>>>+    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
> >>>>>+    s->catalog_bitmap[idx] = pos / s->off_multiplier;
> >>>>>+
> >>>>>+    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
> >>>>>+
> >>>>>+    ret = bdrv_pwrite_sync(bs->file,
> >>>>>+            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
> >>>>What is the purpose of the sync?
> >>>This is necessary to preserve image consistency on crash from
> >>>my point of view. There is no check consistency at the moment.
> >>>The sync will be removed later when proper crash detection
> >>>code will be added (patches 19, 20, 21)
> >>Let's look at possible orderings in case of failure:
> >>
> >>A. BAT update
> >>B. Data write
> >>
> >>This sync enforces A, B ordering.  If we can see B, then A must also
> >>have happened thanks to the sync.
> >>
> >>But A, B ordering is too conservative.  Imagine B, A ordering and the
> >>failure where we crash before A.  It means we wrote the data but never
> >>linked it into the BAT.
> >>
> >>What happens in that case?  We've leaked a cluster in the underlying
> >>image file but it doesn't corrupt the visible disk from the guest
> >>point-of-view.
> >>
> >>Because your implementation uses truncate to extend the file size before
> >>A, even the A, B failure case results in a leaked cluster.  So the B, A
> >>case is not worse in any way.
> >>
> >>Why do other image formats sync cluster allocation updates?  Because
> >>they support backing files and in that case an A, B ordering results in
> >>data corruption so they enforce B, A ordering (the opposite of what
> >>you're trying to do!).
> >>
> >>The reason why A, B ordering results in data corruption when backing
> >>files are in use is because the guest's write request might touch only a
> >>subset of the cluster (a couple of sectors out of the whole cluster).
> >>So the guest needs to copy the remaining sectors from the backing file.
> >>If there is a dangling BAT entry like in the A, B failure case, then the
> >>guest will see a zeroed cluster instead of the contents of the backing
> >>file.  This is a data corruption, but only if a backing file is being
> >>used!
> >>
> >>So the sync is not necessary, both A, B and B, A ordering work for
> >>block/parallels.c.
> >Actually, I suspect this means that the parallels driver is restricted
> >to protocols with bdrv_has_zero_init() == true, otherwise zeros can turn
> >into random data (which means that it can't work e.g. directly on host
> >block devices).
> >
> >Do we enforce this?
> >
> >Kevin
> this is fixed in the patch 26 when the code is replaced with
> 
> +    if (s->data_end + s->tracks > pos) {
> +        int ret;
> +        if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) {
> +            ret = bdrv_write_zeroes(bs->file, s->data_end,
> +                                    s->prealloc_size, 0);
> +        } else {
> +            ret = bdrv_truncate(bs->file,
> +                    (s->data_end + s->prealloc_size) << BDRV_SECTOR_BITS);
> +        }
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> 
> on a default path, but you are correct. Some checking is
> necessary to be on a safe side.

Ah, yes, you're right. That should fix it for the default settings. The
check is only needed when using the bdrv_truncate() path.

Kevin
diff mbox

Patch

diff --git a/block/parallels.c b/block/parallels.c
index 306f2e3..61d7da7 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -81,8 +81,6 @@  static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
     ParallelsHeader ph;
     int ret;
 
-    bs->read_only = 1; // no write support yet
-
     ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph));
     if (ret < 0) {
         goto fail;
@@ -166,6 +164,37 @@  static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num,
     return MIN(nb_sectors, ret);
 }
 
+static int64_t allocate_cluster(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVParallelsState *s = bs->opaque;
+    uint32_t idx, offset, tmp;
+    int64_t pos;
+    int ret;
+
+    idx = sector_num / s->tracks;
+    offset = sector_num % s->tracks;
+
+    if (idx >= s->catalog_size) {
+        return -EINVAL;
+    }
+    if (s->catalog_bitmap[idx] != 0) {
+        return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
+    }
+
+    pos = bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS;
+    bdrv_truncate(bs->file, (pos + s->tracks) << BDRV_SECTOR_BITS);
+    s->catalog_bitmap[idx] = pos / s->off_multiplier;
+
+    tmp = cpu_to_le32(s->catalog_bitmap[idx]);
+
+    ret = bdrv_pwrite_sync(bs->file,
+            sizeof(ParallelsHeader) + idx * sizeof(tmp), &tmp, sizeof(tmp));
+    if (ret < 0) {
+        return ret;
+    }
+    return (uint64_t)s->catalog_bitmap[idx] * s->off_multiplier + offset;
+}
+
 static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -186,6 +215,49 @@  static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
         BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }
 
+static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    BDRVParallelsState *s = bs->opaque;
+    uint64_t bytes_done = 0;
+    QEMUIOVector hd_qiov;
+    int ret = 0;
+
+    qemu_iovec_init(&hd_qiov, qiov->niov);
+
+    qemu_co_mutex_lock(&s->lock);
+    while (nb_sectors > 0) {
+        int64_t position = allocate_cluster(bs, sector_num);
+        int n = cluster_remainder(s, sector_num, nb_sectors);
+        int nbytes = n << BDRV_SECTOR_BITS;
+
+        if (position < 0) {
+            ret = (int)position;
+            break;
+        }
+
+        qemu_iovec_reset(&hd_qiov);
+        qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes);
+
+        qemu_co_mutex_unlock(&s->lock);
+        ret = bdrv_co_writev(bs->file, position, n, &hd_qiov);
+        qemu_co_mutex_lock(&s->lock);
+
+        if (ret < 0) {
+            goto fail;
+        }
+
+        nb_sectors -= n;
+        sector_num += n;
+        bytes_done += nbytes;
+    }
+    qemu_co_mutex_unlock(&s->lock);
+
+fail:
+    qemu_iovec_destroy(&hd_qiov);
+    return ret;
+}
+
 static coroutine_fn int parallels_co_readv(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
@@ -242,6 +314,7 @@  static BlockDriver bdrv_parallels = {
     .bdrv_close		= parallels_close,
     .bdrv_co_get_block_status = parallels_co_get_block_status,
     .bdrv_co_readv  = parallels_co_readv,
+    .bdrv_co_writev = parallels_co_writev,
 };
 
 static void bdrv_parallels_init(void)