diff mbox series

[1/5] block: Add blklogwrites

Message ID 1527801443-30620-1-git-send-email-ari@tuxera.com
State New
Headers show
Series [1/5] block: Add blklogwrites | expand

Commit Message

Ari Sundholm May 31, 2018, 9:17 p.m. UTC
From: Aapo Vienamo <aapo@tuxera.com>

Implements a block device write logging system, similar to Linux kernel
device mapper dm-log-writes. The write operations that are performed
on a block device are logged to a file or another block device. The
write log format is identical to the dm-log-writes format. Currently,
log markers are not supported.

This functionality can be used for fail-safe and fs consistency
testing. By implementing it in qemu, tests utilizing write logs can be
be used to test non-Linux drivers and older kernels.

The implementation is based on the blkverify and blkdebug block drivers.

Signed-off-by: Aapo Vienamo <aapo@tuxera.com>
Signed-off-by: Ari Sundholm <ari@tuxera.com>
---
 block.c               |   6 -
 block/Makefile.objs   |   1 +
 block/blklogwrites.c  | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/block/block.h |   7 +
 4 files changed, 449 insertions(+), 6 deletions(-)
 create mode 100644 block/blklogwrites.c

Comments

Eric Blake June 1, 2018, 12:26 p.m. UTC | #1
On 05/31/2018 04:17 PM, Ari Sundholm wrote:
> From: Aapo Vienamo <aapo@tuxera.com>
> 
> Implements a block device write logging system, similar to Linux kernel

[meta-comment]

Your patch threading is awkward - you forgot a cover letter, and then 
you did deep threading (every message in-reply-to the previous) instead 
of shallow threading (all messages in-reply-to only the 0/5 cover 
letter).  This makes it harder for automated tooling to recognize your 
series properly.

More patch submission hints at:
http://wiki.qemu.org/Contribute/SubmitAPatch

> device mapper dm-log-writes. The write operations that are performed
> on a block device are logged to a file or another block device. The
> write log format is identical to the dm-log-writes format. Currently,
> log markers are not supported.
> 
> This functionality can be used for fail-safe and fs consistency
> testing. By implementing it in qemu, tests utilizing write logs can be
> be used to test non-Linux drivers and older kernels.
> 
> The implementation is based on the blkverify and blkdebug block drivers.
> 
> Signed-off-by: Aapo Vienamo <aapo@tuxera.com>
> Signed-off-by: Ari Sundholm <ari@tuxera.com>
> ---
>   block.c               |   6 -
>   block/Makefile.objs   |   1 +
>   block/blklogwrites.c  | 441 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   include/block/block.h |   7 +
>   4 files changed, 449 insertions(+), 6 deletions(-)
>   create mode 100644 block/blklogwrites.c

Without a cover letter, I'll have to read the entire series to see if 
there is something obviously missing.  For example, I don't yet know if 
a later patch adds to qapi/block-core.json to allow creation of this new 
block device from QMP.

> 
> diff --git a/block.c b/block.c
> index 501b64c..c8cffe1 100644
> --- a/block.c
> +++ b/block.c
> @@ -1914,12 +1914,6 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
>       return 0;
>   }
>   
> -#define DEFAULT_PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
> -                                 | BLK_PERM_WRITE \
> -                                 | BLK_PERM_WRITE_UNCHANGED \
> -                                 | BLK_PERM_RESIZE)
> -#define DEFAULT_PERM_UNCHANGED (BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH)
> -

Your commit message didn't mention why this code motion was necessary; 
in fact, if it IS necessary, it might be better to split it off into a 
separate patch.

> +++ b/block/blklogwrites.c
> @@ -0,0 +1,441 @@
> +/*
> + * Write logging blk driver based on blkverify and blkdebug.
> + *
> + * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com>
> + * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com>
> + * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
> +#include "block/block_int.h"
> +#include "qapi/qmp/qdict.h"
> +#include "qapi/qmp/qstring.h"
> +#include "qemu/cutils.h"
> +#include "qemu/option.h"
> +
> +/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */

Your copyright claims GPLv2+; but drivers/md/dm-log-writes.c states only 
"* This file is released under the GPL." - I guess that means you're 
okay (parts of Linux are GPLv2-only, which we can't copy into a GPLv2+ 
file).

> +
> +/* Valid blk_log_writes filenames look like:
> + *      blk_log_writes:path/to/raw_image:path/to/logfile */
> +static void blk_log_writes_parse_filename(const char *filename, QDict *options,
> +                                          Error **errp)

Do we still want to support yet another legacy syntax addition?  I'd 
rather just require the user to pass in a valid --blockdev specification 
(which implies that you MUST support a QMP description), and skip the 
legacy syntax altogether.

> +
> +static QemuOptsList runtime_opts = {
> +    .name = "blk_log_writes",
> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> +    .desc = {
> +        {
> +            .name = "x-raw",
> +            .type = QEMU_OPT_STRING,
> +            .help = "[internal use only, will be removed]",
> +        },
> +        {
> +            .name = "x-log",
> +            .type = QEMU_OPT_STRING,
> +            .help = "[internal use only, will be removed]",

And the fact that you are adding internal options from the getgo points 
to the fact that we really want a QMP interface.


> +static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp)
> +{
> +    if (bs->bl.request_alignment < BDRV_SECTOR_SIZE) {

Why do you need sector alignment?  And in what cases would 
bs->bl.request_alignment be larger than sector alignment (does the block 
layer take into account if your log file and/or child BDS already have a 
larger alignment?)

> +        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
> +
> +        if (bs->bl.pdiscard_alignment &&
> +                bs->bl.pdiscard_alignment < bs->bl.request_alignment)
> +            bs->bl.pdiscard_alignment = bs->bl.request_alignment;

Every 'if' MUST use {}.  scripts/checkpatch.pl should have flagged this.

> +static void coroutine_fn blk_log_writes_co_do_log(void *opaque)
> +{

> +
> +    lr->r->done++;
> +    qemu_coroutine_enter_if_inactive(lr->r->co);
> +}
> +
> +static void blk_log_writes_co_do_file(void *opaque)
> +{
> +    BlkLogWritesFileReq *fr = opaque;
> +
> +    fr->file_ret = fr->func(fr);
> +
> +    fr->r->done++;

Two non-atomic increments...

> +    qemu_coroutine_enter_if_inactive(fr->r->co);
> +}
> +
> +static int coroutine_fn
> +blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
> +                      QEMUIOVector *qiov, int flags,
> +                      int (*file_func)(BlkLogWritesFileReq *r),
> +                      uint64_t entry_flags)
> +{

> +    qemu_coroutine_enter(co_file);
> +    qemu_coroutine_enter(co_log);
> +
> +    while (r.done < 2) {
> +        qemu_coroutine_yield();
> +    }

...used as the condition for waiting.  Since the point of coroutines is 
to allow (restricted) parallel operation, there's a chance that the 
coroutine implementation can be utilizing parallel threads; if that's 
the case, then on the rare race when both threads try to increment at 
near the same time, they can both read 0 and write 1, at which point 
this wait loop would be an infinite loop.  You're probably better off 
using atomics (even if I'm wrong about coroutines being able to race 
each other on the increment, as the other point of coroutines is that 
they provide restricted parallelism where you can also implement them in 
only a single thread because of well-defined yield points).


> +static BlockDriver bdrv_blk_log_writes = {
> +    .format_name            = "blk_log_writes",
> +    .protocol_name          = "blk_log_writes",
> +    .instance_size          = sizeof(BDRVBlkLogWritesState),
> +
> +    .bdrv_parse_filename    = blk_log_writes_parse_filename,
> +    .bdrv_file_open         = blk_log_writes_open,
> +    .bdrv_close             = blk_log_writes_close,
> +    .bdrv_getlength         = blk_log_writes_getlength,
> +    .bdrv_refresh_filename  = blk_log_writes_refresh_filename,
> +    .bdrv_child_perm        = blk_log_writes_child_perm,
> +    .bdrv_refresh_limits    = blk_log_writes_refresh_limits,
> +
> +    .bdrv_co_preadv         = blk_log_writes_co_preadv,
> +    .bdrv_co_pwritev        = blk_log_writes_co_pwritev,
> +    .bdrv_co_flush_to_disk  = blk_log_writes_co_flush_to_disk,
> +    .bdrv_co_pdiscard       = blk_log_writes_co_pdiscard,
> +
> +    .is_filter              = true,
> +};

No .bdrv_co_pwrite_zeroes support?  While the block layer will emulate 
zero support on top of .bdrv_co_pwritev, that is probably a performance 
killer compared to you implementing it directly.

Also, you probably want .bdrv_co_block_status = 
bdrv_co_block_status_from_file, so that the block layer sees the 
underlying device's block status rather than treating the entire device 
as non-sparse.
Ari Sundholm June 1, 2018, 1:31 p.m. UTC | #2
On 06/01/2018 03:26 PM, Eric Blake wrote:
> On 05/31/2018 04:17 PM, Ari Sundholm wrote:
>> From: Aapo Vienamo <aapo@tuxera.com>
>>
>> Implements a block device write logging system, similar to Linux kernel
> 
> [meta-comment]
> 
> Your patch threading is awkward - you forgot a cover letter, and then 
> you did deep threading (every message in-reply-to the previous) instead 
> of shallow threading (all messages in-reply-to only the 0/5 cover 
> letter).  This makes it harder for automated tooling to recognize your 
> series properly.
> 
> More patch submission hints at:
> http://wiki.qemu.org/Contribute/SubmitAPatch
> 

Thank you, I will improve this in the next version. I was not aware that 
the threading choice breaks the tooling. I should definitely have 
included a cover letter and perhaps moved some of the comment in the 
first patch there, with more explanation about some of the choices made.

>> device mapper dm-log-writes. The write operations that are performed
>> on a block device are logged to a file or another block device. The
>> write log format is identical to the dm-log-writes format. Currently,
>> log markers are not supported.
>>
>> This functionality can be used for fail-safe and fs consistency
>> testing. By implementing it in qemu, tests utilizing write logs can be
>> be used to test non-Linux drivers and older kernels.
>>
>> The implementation is based on the blkverify and blkdebug block drivers.
>>
>> Signed-off-by: Aapo Vienamo <aapo@tuxera.com>
>> Signed-off-by: Ari Sundholm <ari@tuxera.com>
>> ---
>>   block.c               |   6 -
>>   block/Makefile.objs   |   1 +
>>   block/blklogwrites.c  | 441 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   include/block/block.h |   7 +
>>   4 files changed, 449 insertions(+), 6 deletions(-)
>>   create mode 100644 block/blklogwrites.c
> 
> Without a cover letter, I'll have to read the entire series to see if 
> there is something obviously missing.  For example, I don't yet know if 
> a later patch adds to qapi/block-core.json to allow creation of this new 
> block device from QMP.
> 
>>
>> diff --git a/block.c b/block.c
>> index 501b64c..c8cffe1 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -1914,12 +1914,6 @@ int bdrv_child_try_set_perm(BdrvChild *c, 
>> uint64_t perm, uint64_t shared,
>>       return 0;
>>   }
>> -#define DEFAULT_PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
>> -                                 | BLK_PERM_WRITE \
>> -                                 | BLK_PERM_WRITE_UNCHANGED \
>> -                                 | BLK_PERM_RESIZE)
>> -#define DEFAULT_PERM_UNCHANGED (BLK_PERM_ALL & 
>> ~DEFAULT_PERM_PASSTHROUGH)
>> -
> 
> Your commit message didn't mention why this code motion was necessary; 
> in fact, if it IS necessary, it might be better to split it off into a 
> separate patch.
> 

This was just to allow using these constants outside of block.c. Will 
split this off into a separate patch, as requested.

>> +++ b/block/blklogwrites.c
>> @@ -0,0 +1,441 @@
>> +/*
>> + * Write logging blk driver based on blkverify and blkdebug.
>> + *
>> + * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com>
>> + * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com>
>> + * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or 
>> later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qapi/error.h"
>> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
>> +#include "block/block_int.h"
>> +#include "qapi/qmp/qdict.h"
>> +#include "qapi/qmp/qstring.h"
>> +#include "qemu/cutils.h"
>> +#include "qemu/option.h"
>> +
>> +/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
> 
> Your copyright claims GPLv2+; but drivers/md/dm-log-writes.c states only 
> "* This file is released under the GPL." - I guess that means you're 
> okay (parts of Linux are GPLv2-only, which we can't copy into a GPLv2+ 
> file).
> 

I consulted legal counsel on this part, and it is our opinion that the 
constants and structs taken from the Linux kernel are insignificant and 
required for compatibility, so it should be OK to license the file as 
GPLv2+ in any case. Just to confirm, do you see this differently?

>> +
>> +/* Valid blk_log_writes filenames look like:
>> + *      blk_log_writes:path/to/raw_image:path/to/logfile */
>> +static void blk_log_writes_parse_filename(const char *filename, QDict 
>> *options,
>> +                                          Error **errp)
> 
> Do we still want to support yet another legacy syntax addition?  I'd 
> rather just require the user to pass in a valid --blockdev specification 
> (which implies that you MUST support a QMP description), and skip the 
> legacy syntax altogether.
> 

I believe this was inherited from blkverify, which was used as a basis. 
I will have to do some research into how to do this properly for this 
kind of a driver which requires multiple filenames. Any pointers would 
be appreciated.

>> +
>> +static QemuOptsList runtime_opts = {
>> +    .name = "blk_log_writes",
>> +    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
>> +    .desc = {
>> +        {
>> +            .name = "x-raw",
>> +            .type = QEMU_OPT_STRING,
>> +            .help = "[internal use only, will be removed]",
>> +        },
>> +        {
>> +            .name = "x-log",
>> +            .type = QEMU_OPT_STRING,
>> +            .help = "[internal use only, will be removed]",
> 
> And the fact that you are adding internal options from the getgo points 
> to the fact that we really want a QMP interface.
> 

Will try to find out how to do this properly.

> 
>> +static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error 
>> **errp)
>> +{
>> +    if (bs->bl.request_alignment < BDRV_SECTOR_SIZE) {
> 
> Why do you need sector alignment?  And in what cases would 
> bs->bl.request_alignment be larger than sector alignment (does the block 
> layer take into account if your log file and/or child BDS already have a 
> larger alignment?)
> 

Sector alignment is needed because the write logging takes place at 
sector granularity. Each write log entry contains the sector offset and 
the number of sectors the write spanned. I'll try to make this clearer 
in the next version.

>> +        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
>> +
>> +        if (bs->bl.pdiscard_alignment &&
>> +                bs->bl.pdiscard_alignment < bs->bl.request_alignment)
>> +            bs->bl.pdiscard_alignment = bs->bl.request_alignment;
> 
> Every 'if' MUST use {}.  scripts/checkpatch.pl should have flagged this.
> 

Thanks, will fix this and other instances.

>> +static void coroutine_fn blk_log_writes_co_do_log(void *opaque)
>> +{
> 
>> +
>> +    lr->r->done++;
>> +    qemu_coroutine_enter_if_inactive(lr->r->co);
>> +}
>> +
>> +static void blk_log_writes_co_do_file(void *opaque)
>> +{
>> +    BlkLogWritesFileReq *fr = opaque;
>> +
>> +    fr->file_ret = fr->func(fr);
>> +
>> +    fr->r->done++;
> 
> Two non-atomic increments...
> 
>> +    qemu_coroutine_enter_if_inactive(fr->r->co);
>> +}
>> +
>> +static int coroutine_fn
>> +blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t 
>> bytes,
>> +                      QEMUIOVector *qiov, int flags,
>> +                      int (*file_func)(BlkLogWritesFileReq *r),
>> +                      uint64_t entry_flags)
>> +{
> 
>> +    qemu_coroutine_enter(co_file);
>> +    qemu_coroutine_enter(co_log);
>> +
>> +    while (r.done < 2) {
>> +        qemu_coroutine_yield();
>> +    }
> 
> ...used as the condition for waiting.  Since the point of coroutines is 
> to allow (restricted) parallel operation, there's a chance that the 
> coroutine implementation can be utilizing parallel threads; if that's 
> the case, then on the rare race when both threads try to increment at 
> near the same time, they can both read 0 and write 1, at which point 
> this wait loop would be an infinite loop.  You're probably better off 
> using atomics (even if I'm wrong about coroutines being able to race 
> each other on the increment, as the other point of coroutines is that 
> they provide restricted parallelism where you can also implement them in 
> only a single thread because of well-defined yield points).
> 

Inherited from blkverify, I believe. Will fix.

> 
>> +static BlockDriver bdrv_blk_log_writes = {
>> +    .format_name            = "blk_log_writes",
>> +    .protocol_name          = "blk_log_writes",
>> +    .instance_size          = sizeof(BDRVBlkLogWritesState),
>> +
>> +    .bdrv_parse_filename    = blk_log_writes_parse_filename,
>> +    .bdrv_file_open         = blk_log_writes_open,
>> +    .bdrv_close             = blk_log_writes_close,
>> +    .bdrv_getlength         = blk_log_writes_getlength,
>> +    .bdrv_refresh_filename  = blk_log_writes_refresh_filename,
>> +    .bdrv_child_perm        = blk_log_writes_child_perm,
>> +    .bdrv_refresh_limits    = blk_log_writes_refresh_limits,
>> +
>> +    .bdrv_co_preadv         = blk_log_writes_co_preadv,
>> +    .bdrv_co_pwritev        = blk_log_writes_co_pwritev,
>> +    .bdrv_co_flush_to_disk  = blk_log_writes_co_flush_to_disk,
>> +    .bdrv_co_pdiscard       = blk_log_writes_co_pdiscard,
>> +
>> +    .is_filter              = true,
>> +};
> 
> No .bdrv_co_pwrite_zeroes support?  While the block layer will emulate 
> zero support on top of .bdrv_co_pwritev, that is probably a performance 
> killer compared to you implementing it directly.
> 

I see. I'll have to consider the implications with regard to accurate 
write logging to see if this is feasible and desirable.

> Also, you probably want .bdrv_co_block_status = 
> bdrv_co_block_status_from_file, so that the block layer sees the 
> underlying device's block status rather than treating the entire device 
> as non-sparse.
> 

Thank you, will look into this.

Best regards,
Ari Sundholm
ari@tuxera.com
Stefan Hajnoczi June 1, 2018, 1:32 p.m. UTC | #3
On Fri, Jun 01, 2018 at 12:17:19AM +0300, Ari Sundholm wrote:
> From: Aapo Vienamo <aapo@tuxera.com>

Thanks for the patch!

> Implements a block device write logging system, similar to Linux kernel
> device mapper dm-log-writes. The write operations that are performed
> on a block device are logged to a file or another block device. The
> write log format is identical to the dm-log-writes format. Currently,
> log markers are not supported.
> 
> This functionality can be used for fail-safe and fs consistency
> testing. By implementing it in qemu, tests utilizing write logs can be
> be used to test non-Linux drivers and older kernels.

This patch doesn't implement the same semantics as dm-log-writes, where
only completed writes are logged to make fs consistency testing easier.
If you intend to use it for this purpose, shouldn't it act the same way
as dm-log-writes?

> The implementation is based on the blkverify and blkdebug block drivers.

This functionality overlaps with existing block drivers.  In the
long-term people want to observe I/O requests in various ways beyond
what the dm-log-writes offers.  Therefore I'd like to suggest a
different approach that extends the quorum driver instead:

Add a read-pattern=first parameter to the block/quorum.c driver so it
only reads from the first child.  QEMU can be launched with quorum and
NBD so that writes and flushes are mirrored to the external NBD server.

Any file format or online analysis can be implemented in the NBD server.
There are a number of NBD plugins here:
https://github.com/libguestfs/nbdkit

I think sending the writes and flushes over NBD is more flexible.
Ari Sundholm June 1, 2018, 2:24 p.m. UTC | #4
On 06/01/2018 04:32 PM, Stefan Hajnoczi wrote:
> On Fri, Jun 01, 2018 at 12:17:19AM +0300, Ari Sundholm wrote:
>> From: Aapo Vienamo <aapo@tuxera.com>
> 
> Thanks for the patch!
> 
>> Implements a block device write logging system, similar to Linux kernel
>> device mapper dm-log-writes. The write operations that are performed
>> on a block device are logged to a file or another block device. The
>> write log format is identical to the dm-log-writes format. Currently,
>> log markers are not supported.
>>
>> This functionality can be used for fail-safe and fs consistency
>> testing. By implementing it in qemu, tests utilizing write logs can be
>> be used to test non-Linux drivers and older kernels.
> 
> This patch doesn't implement the same semantics as dm-log-writes, where
> only completed writes are logged to make fs consistency testing easier.
> If you intend to use it for this purpose, shouldn't it act the same way
> as dm-log-writes?
> 

I am not quite sure what you mean. I am not the original author of this 
proposed feature, but to me (admittedly with little experience of qemu 
internals), it looks like the driver accurately logs the writes and 
flushes performed on the guest block device. It intentionally does not 
concern itself with when the write actually hits the physical host block 
device or file, as we're interested in the direct interactions between a 
filesystem driver and the guest block device. The write hitting the 
various levels of the host-side caches and devices is left up to the 
caching mode. But perhaps there's something obvious I'm not seeing?

Speaking of the caching mode, I now realized that the log superblock is 
probably never written out when cache=unsafe is set, which will need fixing.

>> The implementation is based on the blkverify and blkdebug block drivers.
> 
> This functionality overlaps with existing block drivers.  In the
> long-term people want to observe I/O requests in various ways beyond
> what the dm-log-writes offers.  Therefore I'd like to suggest a
> different approach that extends the quorum driver instead:
> 
> Add a read-pattern=first parameter to the block/quorum.c driver so it
> only reads from the first child.  QEMU can be launched with quorum and
> NBD so that writes and flushes are mirrored to the external NBD server.
> 
> Any file format or online analysis can be implemented in the NBD server.
> There are a number of NBD plugins here:
> https://github.com/libguestfs/nbdkit
> 
> I think sending the writes and flushes over NBD is more flexible.
> 

That sounds interesting! I will look into this possibility.

Thanks,
Ari Sundholm
ari@tuxera.com
Eric Blake June 1, 2018, 3:05 p.m. UTC | #5
On 06/01/2018 08:31 AM, Ari Sundholm wrote:
>>> +++ b/block/blklogwrites.c
>>> @@ -0,0 +1,441 @@
>>> +/*
>>> + * Write logging blk driver based on blkverify and blkdebug.
>>> + *
>>> + * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com>
>>> + * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com>
>>> + * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 
>>> or later.
>>> + * See the COPYING file in the top-level directory.
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qapi/error.h"
>>> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
>>> +#include "block/block_int.h"
>>> +#include "qapi/qmp/qdict.h"
>>> +#include "qapi/qmp/qstring.h"
>>> +#include "qemu/cutils.h"
>>> +#include "qemu/option.h"
>>> +
>>> +/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
>>
>> Your copyright claims GPLv2+; but drivers/md/dm-log-writes.c states 
>> only "* This file is released under the GPL." - I guess that means 
>> you're okay (parts of Linux are GPLv2-only, which we can't copy into a 
>> GPLv2+ file).
>>
> 
> I consulted legal counsel on this part, and it is our opinion that the 
> constants and structs taken from the Linux kernel are insignificant and 
> required for compatibility, so it should be OK to license the file as 
> GPLv2+ in any case. Just to confirm, do you see this differently?

I'm not a lawyer, but I don't see any problem for the code you copied 
from that particular file (that is, any source file that says just "GPL" 
can be interpreted as "GPLv1+", which is therefore okay to copy into a 
"GPLv2+" file).  I was more pointing out that in general, any time you 
copy code but use a different license, you DO have to perform due 
diligence to ensure you aren't treading on dangerous ground.  And the 
fact that you consulted legal counsel is a good thing.  If it were a 
more controversial claim (copying from a "GPLv2-only" file on the ground 
that the copied portion is part of an essential interface and therefore 
not bound by GPLv2-only and therefore okay for inclusion in GPLv2+, then 
I'd have mentioned the legal counsel's advice as part of the commit 
message; but in this case I don't think that's necessary.  Also, we DO 
have files in qemu that are GPLv2-only - we don't like adding more when 
not necessary, but when copying from a GPLv2-only source, it is 
certainly a justifiable action (rather than worrying about whether 
licensing your file as GPLv2+ is violating the intent of the copied 
portions in your file) - but again, I don't see that as something needed 
for this patch, given that your source file did not state GPLv2-only.

>> Do we still want to support yet another legacy syntax addition?  I'd 
>> rather just require the user to pass in a valid --blockdev 
>> specification (which implies that you MUST support a QMP description), 
>> and skip the legacy syntax altogether.
>>
> 
> I believe this was inherited from blkverify, which was used as a basis. 
> I will have to do some research into how to do this properly for this 
> kind of a driver which requires multiple filenames. Any pointers would 
> be appreciated.
>

Look at qapi/block-core.json BlockdevOptionsBlkverify for how to declare 
the QMP type.  Kevin may also have some hints.
Ari Sundholm June 1, 2018, 3:15 p.m. UTC | #6
On 06/01/2018 06:05 PM, Eric Blake wrote:
> On 06/01/2018 08:31 AM, Ari Sundholm wrote:
>>>> +++ b/block/blklogwrites.c
>>>> @@ -0,0 +1,441 @@
>>>> +/*
>>>> + * Write logging blk driver based on blkverify and blkdebug.
>>>> + *
>>>> + * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com>
>>>> + * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com>
>>>> + * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 2 
>>>> or later.
>>>> + * See the COPYING file in the top-level directory.
>>>> + */
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qapi/error.h"
>>>> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
>>>> +#include "block/block_int.h"
>>>> +#include "qapi/qmp/qdict.h"
>>>> +#include "qapi/qmp/qstring.h"
>>>> +#include "qemu/cutils.h"
>>>> +#include "qemu/option.h"
>>>> +
>>>> +/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
>>>
>>> Your copyright claims GPLv2+; but drivers/md/dm-log-writes.c states 
>>> only "* This file is released under the GPL." - I guess that means 
>>> you're okay (parts of Linux are GPLv2-only, which we can't copy into 
>>> a GPLv2+ file).
>>>
>>
>> I consulted legal counsel on this part, and it is our opinion that the 
>> constants and structs taken from the Linux kernel are insignificant 
>> and required for compatibility, so it should be OK to license the file 
>> as GPLv2+ in any case. Just to confirm, do you see this differently?
> 
> I'm not a lawyer, but I don't see any problem for the code you copied 
> from that particular file (that is, any source file that says just "GPL" 
> can be interpreted as "GPLv1+", which is therefore okay to copy into a 
> "GPLv2+" file).  I was more pointing out that in general, any time you 
> copy code but use a different license, you DO have to perform due 
> diligence to ensure you aren't treading on dangerous ground.  And the 
> fact that you consulted legal counsel is a good thing.  If it were a 
> more controversial claim (copying from a "GPLv2-only" file on the ground 
> that the copied portion is part of an essential interface and therefore 
> not bound by GPLv2-only and therefore okay for inclusion in GPLv2+, then 
> I'd have mentioned the legal counsel's advice as part of the commit 
> message; but in this case I don't think that's necessary.  Also, we DO 
> have files in qemu that are GPLv2-only - we don't like adding more when 
> not necessary, but when copying from a GPLv2-only source, it is 
> certainly a justifiable action (rather than worrying about whether 
> licensing your file as GPLv2+ is violating the intent of the copied 
> portions in your file) - but again, I don't see that as something needed 
> for this patch, given that your source file did not state GPLv2-only.
> 
>>> Do we still want to support yet another legacy syntax addition?  I'd 
>>> rather just require the user to pass in a valid --blockdev 
>>> specification (which implies that you MUST support a QMP 
>>> description), and skip the legacy syntax altogether.
>>>
>>
>> I believe this was inherited from blkverify, which was used as a 
>> basis. I will have to do some research into how to do this properly 
>> for this kind of a driver which requires multiple filenames. Any 
>> pointers would be appreciated.
>>
> 
> Look at qapi/block-core.json BlockdevOptionsBlkverify for how to declare 
> the QMP type.  Kevin may also have some hints.
> 

Thank you.

I'll try to get a new, more proper version of the patchset out for 
review/comments next week. Sorry for the sloppiness in this first version!

Thanks,
Ari Sundholm
ari@tuxera.com
Eric Blake June 1, 2018, 3:44 p.m. UTC | #7
On 06/01/2018 10:15 AM, Ari Sundholm wrote:
> Thank you.
> 
> I'll try to get a new, more proper version of the patchset out for 
> review/comments next week. Sorry for the sloppiness in this first version!

Looking forward to it, even if, as Stefan pointed out, we can already do 
everything by use of the existing quorum driver and a custom NBD server.

And don't worry about the patch not being perfect the first time - 
that's just a typical part of the review process.  Although unintended 
on my end, I know that my reviews can be perceived as coming across 
rather negatively, especially when I don't always remember to also 
express my appreciation for the efforts you've already put into writing 
a patch.  The community is always better when we remember to treat 
newcomers (and long-time contributors) nicely, regardless of the outcome 
of the patch review process; and the reason we review things is so that 
however sloppy a first draft was, the final product is a lot better!
Stefan Hajnoczi June 4, 2018, 9:51 a.m. UTC | #8
On Fri, Jun 01, 2018 at 05:24:53PM +0300, Ari Sundholm wrote:
> On 06/01/2018 04:32 PM, Stefan Hajnoczi wrote:
> > On Fri, Jun 01, 2018 at 12:17:19AM +0300, Ari Sundholm wrote:
> > > From: Aapo Vienamo <aapo@tuxera.com>
> > 
> > Thanks for the patch!
> > 
> > > Implements a block device write logging system, similar to Linux kernel
> > > device mapper dm-log-writes. The write operations that are performed
> > > on a block device are logged to a file or another block device. The
> > > write log format is identical to the dm-log-writes format. Currently,
> > > log markers are not supported.
> > > 
> > > This functionality can be used for fail-safe and fs consistency
> > > testing. By implementing it in qemu, tests utilizing write logs can be
> > > be used to test non-Linux drivers and older kernels.
> > 
> > This patch doesn't implement the same semantics as dm-log-writes, where
> > only completed writes are logged to make fs consistency testing easier.
> > If you intend to use it for this purpose, shouldn't it act the same way
> > as dm-log-writes?
> > 
> 
> I am not quite sure what you mean. I am not the original author of this
> proposed feature, but to me (admittedly with little experience of qemu
> internals), it looks like the driver accurately logs the writes and flushes
> performed on the guest block device. It intentionally does not concern
> itself with when the write actually hits the physical host block device or
> file, as we're interested in the direct interactions between a filesystem
> driver and the guest block device. The write hitting the various levels of
> the host-side caches and devices is left up to the caching mode. But perhaps
> there's something obvious I'm not seeing?

Linux dm-log-writes is specific about when logging happens:

 * We log writes only after they have been flushed, this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache.  So
 * for example the following sequence (W means write, C means complete)
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * Would result in the log looking like this:
 *
 * c,a,flush,fuad,b,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
 * completes it is added to the log as it should be on disk.

This patch implements the same on-disk format but the semantics are
different since it doesn't wait for a flush.

If I use dm-log-writes on a linear device-mapper target inside the guest
or on the host, then I would have expected the same output as QEMU's
dm-log-writes, but I think this is not the case.

It's worth at least documenting this quirk.

Stefan
Stefan Hajnoczi June 4, 2018, 9:59 a.m. UTC | #9
On Fri, Jun 01, 2018 at 07:26:03AM -0500, Eric Blake wrote:
> On 05/31/2018 04:17 PM, Ari Sundholm wrote:
> > +static void blk_log_writes_co_do_file(void *opaque)
> > +{
> > +    BlkLogWritesFileReq *fr = opaque;
> > +
> > +    fr->file_ret = fr->func(fr);
> > +
> > +    fr->r->done++;
> 
> Two non-atomic increments...
> 
> > +    qemu_coroutine_enter_if_inactive(fr->r->co);
> > +}
> > +
> > +static int coroutine_fn
> > +blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
> > +                      QEMUIOVector *qiov, int flags,
> > +                      int (*file_func)(BlkLogWritesFileReq *r),
> > +                      uint64_t entry_flags)
> > +{
> 
> > +    qemu_coroutine_enter(co_file);
> > +    qemu_coroutine_enter(co_log);
> > +
> > +    while (r.done < 2) {
> > +        qemu_coroutine_yield();
> > +    }
> 
> ...used as the condition for waiting.  Since the point of coroutines is to
> allow (restricted) parallel operation, there's a chance that the coroutine
> implementation can be utilizing parallel threads; if that's the case, then
> on the rare race when both threads try to increment at near the same time,
> they can both read 0 and write 1, at which point this wait loop would be an
> infinite loop.  You're probably better off using atomics (even if I'm wrong
> about coroutines being able to race each other on the increment, as the
> other point of coroutines is that they provide restricted parallelism where
> you can also implement them in only a single thread because of well-defined
> yield points).

In this case the coroutines run from a single event loop (the
BlockDriverState's AioContext) so they cannot race.

As QEMU transitions to a multi-queue block layer we will need to think
about parallelism more.  But the multi-queue block layer isn't
implemented yet, so I prefer writing straightforward code now without
trying to anticipate what parallelism issues might arise in the future.

Stefan
Ari Sundholm June 4, 2018, 12:10 p.m. UTC | #10
On 06/04/2018 12:51 PM, Stefan Hajnoczi wrote:
> On Fri, Jun 01, 2018 at 05:24:53PM +0300, Ari Sundholm wrote:
>> On 06/01/2018 04:32 PM, Stefan Hajnoczi wrote:
>>> On Fri, Jun 01, 2018 at 12:17:19AM +0300, Ari Sundholm wrote:
>>>> From: Aapo Vienamo <aapo@tuxera.com>
>>>
>>> Thanks for the patch!
>>>
>>>> Implements a block device write logging system, similar to Linux kernel
>>>> device mapper dm-log-writes. The write operations that are performed
>>>> on a block device are logged to a file or another block device. The
>>>> write log format is identical to the dm-log-writes format. Currently,
>>>> log markers are not supported.
>>>>
>>>> This functionality can be used for fail-safe and fs consistency
>>>> testing. By implementing it in qemu, tests utilizing write logs can be
>>>> be used to test non-Linux drivers and older kernels.
>>>
>>> This patch doesn't implement the same semantics as dm-log-writes, where
>>> only completed writes are logged to make fs consistency testing easier.
>>> If you intend to use it for this purpose, shouldn't it act the same way
>>> as dm-log-writes?
>>>
>>
>> I am not quite sure what you mean. I am not the original author of this
>> proposed feature, but to me (admittedly with little experience of qemu
>> internals), it looks like the driver accurately logs the writes and flushes
>> performed on the guest block device. It intentionally does not concern
>> itself with when the write actually hits the physical host block device or
>> file, as we're interested in the direct interactions between a filesystem
>> driver and the guest block device. The write hitting the various levels of
>> the host-side caches and devices is left up to the caching mode. But perhaps
>> there's something obvious I'm not seeing?
> 
> Linux dm-log-writes is specific about when logging happens:
> 
>   * We log writes only after they have been flushed, this makes the log describe
>   * close to the order in which the data hits the actual disk, not its cache.  So
>   * for example the following sequence (W means write, C means complete)
>   *
>   * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
>   *
>   * Would result in the log looking like this:
>   *
>   * c,a,flush,fuad,b,<other writes>,<next flush>
>   *
>   * This is meant to help expose problems where file systems do not properly wait
>   * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
>   * completes it is added to the log as it should be on disk.
> 
> This patch implements the same on-disk format but the semantics are
> different since it doesn't wait for a flush.
> 
> If I use dm-log-writes on a linear device-mapper target inside the guest
> or on the host, then I would have expected the same output as QEMU's
> dm-log-writes, but I think this is not the case.
> 
> It's worth at least documenting this quirk.

Oh, now I see what you mean.

I was under the impression that when we do the logging at the level it 
is done now, we see the actual writes to the guest block device to 
completion, as far as the guest is concerned (being safely in the host's 
page cache is enough for this). Is this understanding incorrect?

Regarding the issue of waiting for flushes, as it is now, the driver 
only updates the superblock of the log on flush, and everything in the 
log file/device beyond the limits set by the values in the superblock is 
typically ignored as garbage by tools handling dm-log-writes logs. (I 
wonder if dm-log-writes actually works similarly by exploiting this 
fact?) QEMU does a flush on shutdown, though, so, with how the code 
works, all writes will have hit the disk and been logged, unless QEMU 
has crashed or cache=unsafe is on (have to figure out what to do about 
that last bit!).

I guess this at least requires documenting, regardless of whether the 
behavior is correct or not. Thank you for bringing this up!

Thanks,
Ari Sundholm
ari@tuxera.com

> 
> Stefan
>
Stefan Hajnoczi June 7, 2018, 12:30 p.m. UTC | #11
On Mon, Jun 04, 2018 at 03:10:47PM +0300, Ari Sundholm wrote:
> On 06/04/2018 12:51 PM, Stefan Hajnoczi wrote:
> > On Fri, Jun 01, 2018 at 05:24:53PM +0300, Ari Sundholm wrote:
> > > On 06/01/2018 04:32 PM, Stefan Hajnoczi wrote:
> > > > On Fri, Jun 01, 2018 at 12:17:19AM +0300, Ari Sundholm wrote:
> > > > > From: Aapo Vienamo <aapo@tuxera.com>
> > > > 
> > > > Thanks for the patch!
> > > > 
> > > > > Implements a block device write logging system, similar to Linux kernel
> > > > > device mapper dm-log-writes. The write operations that are performed
> > > > > on a block device are logged to a file or another block device. The
> > > > > write log format is identical to the dm-log-writes format. Currently,
> > > > > log markers are not supported.
> > > > > 
> > > > > This functionality can be used for fail-safe and fs consistency
> > > > > testing. By implementing it in qemu, tests utilizing write logs can be
> > > > > be used to test non-Linux drivers and older kernels.
> > > > 
> > > > This patch doesn't implement the same semantics as dm-log-writes, where
> > > > only completed writes are logged to make fs consistency testing easier.
> > > > If you intend to use it for this purpose, shouldn't it act the same way
> > > > as dm-log-writes?
> > > > 
> > > 
> > > I am not quite sure what you mean. I am not the original author of this
> > > proposed feature, but to me (admittedly with little experience of qemu
> > > internals), it looks like the driver accurately logs the writes and flushes
> > > performed on the guest block device. It intentionally does not concern
> > > itself with when the write actually hits the physical host block device or
> > > file, as we're interested in the direct interactions between a filesystem
> > > driver and the guest block device. The write hitting the various levels of
> > > the host-side caches and devices is left up to the caching mode. But perhaps
> > > there's something obvious I'm not seeing?
> > 
> > Linux dm-log-writes is specific about when logging happens:
> > 
> >   * We log writes only after they have been flushed, this makes the log describe
> >   * close to the order in which the data hits the actual disk, not its cache.  So
> >   * for example the following sequence (W means write, C means complete)
> >   *
> >   * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
> >   *
> >   * Would result in the log looking like this:
> >   *
> >   * c,a,flush,fuad,b,<other writes>,<next flush>
> >   *
> >   * This is meant to help expose problems where file systems do not properly wait
> >   * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
> >   * completes it is added to the log as it should be on disk.
> > 
> > This patch implements the same on-disk format but the semantics are
> > different since it doesn't wait for a flush.
> > 
> > If I use dm-log-writes on a linear device-mapper target inside the guest
> > or on the host, then I would have expected the same output as QEMU's
> > dm-log-writes, but I think this is not the case.
> > 
> > It's worth at least documenting this quirk.
> 
> Oh, now I see what you mean.
> 
> I was under the impression that when we do the logging at the level it is
> done now, we see the actual writes to the guest block device to completion,
> as far as the guest is concerned (being safely in the host's page cache is
> enough for this). Is this understanding incorrect?

.bdrv_co_pwritev is called on request submission.
blk_log_writes_co_log() spawns a coroutine for
blk_log_writes_co_do_log(), which appends a struct log_write_entry to
the log file independently of the other coroutine that is doing the
actual write to the underlying file.

This means the entry could be added to the log file before the write
request completes.

If you want to log on request completion then you cannot spawn the log
coroutine in parallel with the write coroutine.  Instead you would have
to complete the write and then update the log.

> Regarding the issue of waiting for flushes, as it is now, the driver only
> updates the superblock of the log on flush, and everything in the log
> file/device beyond the limits set by the values in the superblock is
> typically ignored as garbage by tools handling dm-log-writes logs. (I wonder
> if dm-log-writes actually works similarly by exploiting this fact?)

This does work in the current patch because the log is updated before
the write has completed.

Stefan
Ari Sundholm June 7, 2018, 1:13 p.m. UTC | #12
On 06/07/2018 03:30 PM, Stefan Hajnoczi wrote:
> On Mon, Jun 04, 2018 at 03:10:47PM +0300, Ari Sundholm wrote:
>> On 06/04/2018 12:51 PM, Stefan Hajnoczi wrote:
>>> On Fri, Jun 01, 2018 at 05:24:53PM +0300, Ari Sundholm wrote:
>>>> On 06/01/2018 04:32 PM, Stefan Hajnoczi wrote:
>>>>> On Fri, Jun 01, 2018 at 12:17:19AM +0300, Ari Sundholm wrote:
>>>>>> From: Aapo Vienamo <aapo@tuxera.com>
>>>>>
>>>>> Thanks for the patch!
>>>>>
>>>>>> Implements a block device write logging system, similar to Linux kernel
>>>>>> device mapper dm-log-writes. The write operations that are performed
>>>>>> on a block device are logged to a file or another block device. The
>>>>>> write log format is identical to the dm-log-writes format. Currently,
>>>>>> log markers are not supported.
>>>>>>
>>>>>> This functionality can be used for fail-safe and fs consistency
>>>>>> testing. By implementing it in qemu, tests utilizing write logs can be
>>>>>> be used to test non-Linux drivers and older kernels.
>>>>>
>>>>> This patch doesn't implement the same semantics as dm-log-writes, where
>>>>> only completed writes are logged to make fs consistency testing easier.
>>>>> If you intend to use it for this purpose, shouldn't it act the same way
>>>>> as dm-log-writes?
>>>>>
>>>>
>>>> I am not quite sure what you mean. I am not the original author of this
>>>> proposed feature, but to me (admittedly with little experience of qemu
>>>> internals), it looks like the driver accurately logs the writes and flushes
>>>> performed on the guest block device. It intentionally does not concern
>>>> itself with when the write actually hits the physical host block device or
>>>> file, as we're interested in the direct interactions between a filesystem
>>>> driver and the guest block device. The write hitting the various levels of
>>>> the host-side caches and devices is left up to the caching mode. But perhaps
>>>> there's something obvious I'm not seeing?
>>>
>>> Linux dm-log-writes is specific about when logging happens:
>>>
>>>    * We log writes only after they have been flushed, this makes the log describe
>>>    * close to the order in which the data hits the actual disk, not its cache.  So
>>>    * for example the following sequence (W means write, C means complete)
>>>    *
>>>    * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
>>>    *
>>>    * Would result in the log looking like this:
>>>    *
>>>    * c,a,flush,fuad,b,<other writes>,<next flush>
>>>    *
>>>    * This is meant to help expose problems where file systems do not properly wait
>>>    * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
>>>    * completes it is added to the log as it should be on disk.
>>>
>>> This patch implements the same on-disk format but the semantics are
>>> different since it doesn't wait for a flush.
>>>
>>> If I use dm-log-writes on a linear device-mapper target inside the guest
>>> or on the host, then I would have expected the same output as QEMU's
>>> dm-log-writes, but I think this is not the case.
>>>
>>> It's worth at least documenting this quirk.
>>
>> Oh, now I see what you mean.
>>
>> I was under the impression that when we do the logging at the level it is
>> done now, we see the actual writes to the guest block device to completion,
>> as far as the guest is concerned (being safely in the host's page cache is
>> enough for this). Is this understanding incorrect?
> 
> .bdrv_co_pwritev is called on request submission.
> blk_log_writes_co_log() spawns a coroutine for
> blk_log_writes_co_do_log(), which appends a struct log_write_entry to
> the log file independently of the other coroutine that is doing the
> actual write to the underlying file.
> 
> This means the entry could be added to the log file before the write
> request completes.
> 
> If you want to log on request completion then you cannot spawn the log
> coroutine in parallel with the write coroutine.  Instead you would have
> to complete the write and then update the log.
> 

Thank you. I can't believe I missed that...

Will be fixed in next version by making things sequential.

>> Regarding the issue of waiting for flushes, as it is now, the driver only
>> updates the superblock of the log on flush, and everything in the log
>> file/device beyond the limits set by the values in the superblock is
>> typically ignored as garbage by tools handling dm-log-writes logs. (I wonder
>> if dm-log-writes actually works similarly by exploiting this fact?)
> 
> This does work in the current patch because the log is updated before
> the write has completed.
> 
> Stefan
> 

Best regards,
Ari Sundholm
ari@tuxera.com
diff mbox series

Patch

diff --git a/block.c b/block.c
index 501b64c..c8cffe1 100644
--- a/block.c
+++ b/block.c
@@ -1914,12 +1914,6 @@  int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
     return 0;
 }
 
-#define DEFAULT_PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
-                                 | BLK_PERM_WRITE \
-                                 | BLK_PERM_WRITE_UNCHANGED \
-                                 | BLK_PERM_RESIZE)
-#define DEFAULT_PERM_UNCHANGED (BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH)
-
 void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
                                const BdrvChildRole *role,
                                BlockReopenQueue *reopen_queue,
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 899bfb5..c8337bf 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -5,6 +5,7 @@  block-obj-y += qed-check.o
 block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-y += quorum.o
 block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
+block-obj-y += blklogwrites.o
 block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += file-posix.o
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
new file mode 100644
index 0000000..78835bf
--- /dev/null
+++ b/block/blklogwrites.c
@@ -0,0 +1,441 @@ 
+/*
+ * Write logging blk driver based on blkverify and blkdebug.
+ *
+ * Copyright (c) 2017 Tuomas Tynkkynen <tuomas@tuxera.com>
+ * Copyright (c) 2018 Aapo Vienamo <aapo@tuxera.com>
+ * Copyright (c) 2018 Ari Sundholm <ari@tuxera.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+#include "qemu/option.h"
+
+/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
+
+#define LOG_FLUSH_FLAG (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG (1 << 3)
+
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+
+/* All fields are little-endian. */
+struct log_write_super {
+    uint64_t magic;
+    uint64_t version;
+    uint64_t nr_entries;
+    uint32_t sectorsize;
+};
+
+struct log_write_entry {
+    uint64_t sector;
+    uint64_t nr_sectors;
+    uint64_t flags;
+    uint64_t data_len;
+};
+
+/* End of disk format structures. */
+
+typedef struct {
+    BdrvChild *log_file;
+    uint64_t cur_log_sector;
+    uint64_t nr_entries;
+} BDRVBlkLogWritesState;
+
+/* Valid blk_log_writes filenames look like:
+ *      blk_log_writes:path/to/raw_image:path/to/logfile */
+static void blk_log_writes_parse_filename(const char *filename, QDict *options,
+                                          Error **errp)
+{
+    const char *c;
+    QString *raw_path;
+
+    /* Parse the blk_log_writes: prefix */
+    if (!strstart(filename, "blk_log_writes:", &filename)) {
+        /* There was no prefix; therefore, all options have to be already
+         * present in the QDict (except for the filename) */
+        qdict_put(options, "x-log", qstring_from_str(filename));
+        return;
+    }
+
+    /* Parse the raw image filename */
+    c = strchr(filename, ':');
+    if (c == NULL) {
+        error_setg(errp,
+                   "blk_log_writes requires paths to both image and log");
+        return;
+    }
+
+    raw_path = qstring_from_substr(filename, 0, c - filename - 1);
+    qdict_put(options, "x-raw", raw_path);
+
+    filename = c + 1;
+    qdict_put(options, "x-log", qstring_from_str(filename));
+}
+
+static QemuOptsList runtime_opts = {
+    .name = "blk_log_writes",
+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+    .desc = {
+        {
+            .name = "x-raw",
+            .type = QEMU_OPT_STRING,
+            .help = "[internal use only, will be removed]",
+        },
+        {
+            .name = "x-log",
+            .type = QEMU_OPT_STRING,
+            .help = "[internal use only, will be removed]",
+        },
+        { /* end of list */ }
+    },
+};
+
+static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags,
+                               Error **errp)
+{
+    BDRVBlkLogWritesState *s = bs->opaque;
+    QemuOpts *opts;
+    Error *local_err = NULL;
+    int ret;
+
+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+    qemu_opts_absorb_qdict(opts, options, &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        error_propagate(errp, local_err);
+        goto fail;
+    }
+
+    /* Open the raw file */
+    bs->file = bdrv_open_child(qemu_opt_get(opts, "x-raw"), options, "raw",
+                               bs, &child_file, false, &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        error_propagate(errp, local_err);
+        goto fail;
+    }
+
+    s->cur_log_sector = 1;
+    s->nr_entries = 0;
+    qdict_put_str(options, "log.driver", "raw");
+    qdict_put_str(options, "log.read-only", "off");
+
+    /* Open the log file */
+    s->log_file = bdrv_open_child(qemu_opt_get(opts, "x-log"), options,
+                                  "log", bs, &child_file, false,
+                                  &local_err);
+    if (local_err) {
+        ret = -EINVAL;
+        error_propagate(errp, local_err);
+        goto fail;
+    }
+
+    ret = 0;
+fail:
+    if (ret < 0) {
+        bdrv_unref_child(bs, bs->file);
+        bs->file = NULL;
+    }
+    qemu_opts_del(opts);
+    return ret;
+}
+
+static void blk_log_writes_close(BlockDriverState *bs)
+{
+    BDRVBlkLogWritesState *s = bs->opaque;
+
+    bdrv_unref_child(bs, s->log_file);
+    s->log_file = NULL;
+}
+
+static int64_t blk_log_writes_getlength(BlockDriverState *bs)
+{
+    return bdrv_getlength(bs->file->bs);
+}
+
+static void blk_log_writes_refresh_filename(BlockDriverState *bs,
+                                            QDict *options)
+{
+    BDRVBlkLogWritesState *s = bs->opaque;
+
+    /* bs->file->bs has already been refreshed */
+    bdrv_refresh_filename(s->log_file->bs);
+
+    if (bs->file->bs->full_open_options
+        && s->log_file->bs->full_open_options)
+    {
+        QDict *opts = qdict_new();
+        qdict_put_obj(opts, "driver",
+                      QOBJECT(qstring_from_str("blk_log_writes")));
+
+        qobject_ref(bs->file->bs->full_open_options);
+        qdict_put_obj(opts, "raw", QOBJECT(bs->file->bs->full_open_options));
+        qobject_ref(s->log_file->bs->full_open_options);
+        qdict_put_obj(opts, "log",
+                      QOBJECT(s->log_file->bs->full_open_options));
+
+        bs->full_open_options = opts;
+    }
+
+    if (bs->file->bs->exact_filename[0]
+        && s->log_file->bs->exact_filename[0])
+    {
+        snprintf(bs->exact_filename, sizeof(bs->exact_filename),
+                 "blk_log_writes:%s:%s",
+                 bs->file->bs->exact_filename,
+                 s->log_file->bs->exact_filename);
+    }
+}
+
+static void blk_log_writes_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                      const BdrvChildRole *role,
+                                      BlockReopenQueue *ro_q,
+                                      uint64_t perm, uint64_t shrd,
+                                      uint64_t *nperm, uint64_t *nshrd)
+{
+    if (!c) {
+        *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
+        *nshrd = (shrd & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
+        return;
+    }
+
+    if (!strcmp(c->name, "log"))
+        bdrv_format_default_perms(bs, c, role, ro_q, perm, shrd, nperm, nshrd);
+    else
+        bdrv_filter_default_perms(bs, c, role, ro_q, perm, shrd, nperm, nshrd);
+}
+
+static void blk_log_writes_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    if (bs->bl.request_alignment < BDRV_SECTOR_SIZE) {
+        bs->bl.request_alignment = BDRV_SECTOR_SIZE;
+
+        if (bs->bl.pdiscard_alignment &&
+                bs->bl.pdiscard_alignment < bs->bl.request_alignment)
+            bs->bl.pdiscard_alignment = bs->bl.request_alignment;
+        if (bs->bl.pwrite_zeroes_alignment &&
+                bs->bl.pwrite_zeroes_alignment < bs->bl.request_alignment)
+            bs->bl.pwrite_zeroes_alignment = bs->bl.request_alignment;
+    }
+}
+
+static int coroutine_fn
+blk_log_writes_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                         QEMUIOVector *qiov, int flags)
+{
+    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
+}
+
+typedef struct BlkLogWritesReq {
+    Coroutine *co;
+    BlockDriverState *bs;
+    unsigned done;
+} BlkLogWritesReq;
+
+typedef struct BlkLogWritesFileReq {
+    BlkLogWritesReq *r;
+    uint64_t offset;
+    uint64_t bytes;
+    int file_flags;
+    QEMUIOVector *qiov;
+    int (*func)(struct BlkLogWritesFileReq *r);
+    int file_ret;
+} BlkLogWritesFileReq;
+
+typedef struct {
+    BlkLogWritesReq *r;
+    QEMUIOVector *qiov;
+    struct log_write_entry entry;
+    int log_ret;
+} BlkLogWritesLogReq;
+
+static void coroutine_fn blk_log_writes_co_do_log(void *opaque)
+{
+    BlkLogWritesLogReq *lr = opaque;
+    BDRVBlkLogWritesState *s = lr->r->bs->opaque;
+    uint64_t cur_log_offset = s->cur_log_sector * BDRV_SECTOR_SIZE;
+
+    s->nr_entries++;
+    s->cur_log_sector +=
+            ROUND_UP(lr->qiov->size, BDRV_SECTOR_SIZE) >> BDRV_SECTOR_BITS;
+
+    lr->log_ret = bdrv_co_pwritev(s->log_file, cur_log_offset, lr->qiov->size,
+                                  lr->qiov, 0);
+
+    /* Update super block on flush */
+    if (lr->log_ret == 0 && lr->entry.flags & LOG_FLUSH_FLAG) {
+        struct log_write_super super = {
+            .magic      = cpu_to_le64(WRITE_LOG_MAGIC),
+            .version    = cpu_to_le64(WRITE_LOG_VERSION),
+            .nr_entries = cpu_to_le64(s->nr_entries),
+            .sectorsize = cpu_to_le32(1 << BDRV_SECTOR_BITS),
+        };
+        static const char zeroes[BDRV_SECTOR_SIZE - sizeof(super)] = { '\0' };
+        QEMUIOVector qiov;
+
+        qemu_iovec_init(&qiov, 2);
+        qemu_iovec_add(&qiov, &super, sizeof(super));
+        qemu_iovec_add(&qiov, (void *)zeroes, sizeof(zeroes));
+
+        lr->log_ret =
+            bdrv_co_pwritev(s->log_file, 0, BDRV_SECTOR_SIZE, &qiov, 0);
+        if (lr->log_ret == 0)
+            lr->log_ret = bdrv_co_flush(s->log_file->bs);
+        qemu_iovec_destroy(&qiov);
+    }
+
+    lr->r->done++;
+    qemu_coroutine_enter_if_inactive(lr->r->co);
+}
+
+static void blk_log_writes_co_do_file(void *opaque)
+{
+    BlkLogWritesFileReq *fr = opaque;
+
+    fr->file_ret = fr->func(fr);
+
+    fr->r->done++;
+    qemu_coroutine_enter_if_inactive(fr->r->co);
+}
+
+static int coroutine_fn
+blk_log_writes_co_log(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                      QEMUIOVector *qiov, int flags,
+                      int (*file_func)(BlkLogWritesFileReq *r),
+                      uint64_t entry_flags)
+{
+    QEMUIOVector log_qiov;
+    size_t niov = qiov ? qiov->niov : 0;
+    Coroutine *co_log, *co_file;
+    BlkLogWritesReq r = {
+        .co     = qemu_coroutine_self(),
+        .bs     = bs,
+        .done   = 0,
+    };
+    BlkLogWritesFileReq fr = {
+        .r      = &r,
+        .offset = offset,
+        .bytes  = bytes,
+        .qiov   = qiov,
+        .func   = file_func,
+    };
+    BlkLogWritesLogReq lr = {
+        .r              = &r,
+        .qiov           = &log_qiov,
+        .entry = {
+            .sector     = cpu_to_le64(offset >> BDRV_SECTOR_BITS),
+            .nr_sectors = cpu_to_le64(bytes >> BDRV_SECTOR_BITS),
+            .flags      = cpu_to_le64(entry_flags),
+            .data_len   = 0,
+        },
+    };
+    static const char zeroes[BDRV_SECTOR_SIZE - sizeof(struct log_write_entry)]
+        = { '\0' };
+
+    assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+    assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+
+    qemu_iovec_init(&log_qiov, niov + 2);
+    qemu_iovec_add(&log_qiov, &lr.entry, sizeof(lr.entry));
+    qemu_iovec_add(&log_qiov, (void *)zeroes, sizeof(zeroes));
+    for (size_t i = 0; i < niov; ++i) {
+        qemu_iovec_add(&log_qiov, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+    }
+
+    co_log = qemu_coroutine_create(blk_log_writes_co_do_log, &lr);
+    co_file = qemu_coroutine_create(blk_log_writes_co_do_file, &fr);
+
+    qemu_coroutine_enter(co_file);
+    qemu_coroutine_enter(co_log);
+
+    while (r.done < 2) {
+        qemu_coroutine_yield();
+    }
+
+    qemu_iovec_destroy(&log_qiov);
+
+    if (lr.log_ret < 0) {
+        return lr.log_ret;
+    }
+
+    return fr.file_ret;
+}
+
+static int coroutine_fn
+blk_log_writes_co_do_file_pwritev(BlkLogWritesFileReq *fr)
+{
+    return bdrv_co_pwritev(fr->r->bs->file, fr->offset, fr->bytes,
+                           fr->qiov, fr->file_flags);
+}
+
+static int coroutine_fn blk_log_writes_co_do_file_flush(BlkLogWritesFileReq *fr)
+{
+    return bdrv_co_flush(fr->r->bs->file->bs);
+}
+
+static int coroutine_fn
+blk_log_writes_co_do_file_pdiscard(BlkLogWritesFileReq *fr)
+{
+    return bdrv_co_pdiscard(fr->r->bs->file->bs, fr->offset, fr->bytes);
+}
+
+static int coroutine_fn
+blk_log_writes_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+                          QEMUIOVector *qiov, int flags)
+{
+    return blk_log_writes_co_log(bs, offset, bytes, qiov, flags,
+                                 blk_log_writes_co_do_file_pwritev, 0);
+}
+
+static int coroutine_fn blk_log_writes_co_flush_to_disk(BlockDriverState *bs)
+{
+    return blk_log_writes_co_log(bs, 0, 0, NULL, 0,
+                                 blk_log_writes_co_do_file_flush,
+                                 LOG_FLUSH_FLAG);
+}
+
+static int coroutine_fn
+blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
+{
+    return blk_log_writes_co_log(bs, offset, count, NULL, 0,
+                                 blk_log_writes_co_do_file_pdiscard,
+                                 LOG_DISCARD_FLAG);
+}
+
+static BlockDriver bdrv_blk_log_writes = {
+    .format_name            = "blk_log_writes",
+    .protocol_name          = "blk_log_writes",
+    .instance_size          = sizeof(BDRVBlkLogWritesState),
+
+    .bdrv_parse_filename    = blk_log_writes_parse_filename,
+    .bdrv_file_open         = blk_log_writes_open,
+    .bdrv_close             = blk_log_writes_close,
+    .bdrv_getlength         = blk_log_writes_getlength,
+    .bdrv_refresh_filename  = blk_log_writes_refresh_filename,
+    .bdrv_child_perm        = blk_log_writes_child_perm,
+    .bdrv_refresh_limits    = blk_log_writes_refresh_limits,
+
+    .bdrv_co_preadv         = blk_log_writes_co_preadv,
+    .bdrv_co_pwritev        = blk_log_writes_co_pwritev,
+    .bdrv_co_flush_to_disk  = blk_log_writes_co_flush_to_disk,
+    .bdrv_co_pdiscard       = blk_log_writes_co_pdiscard,
+
+    .is_filter              = true,
+};
+
+static void bdrv_blk_log_writes_init(void)
+{
+    bdrv_register(&bdrv_blk_log_writes);
+}
+
+block_init(bdrv_blk_log_writes_init);
diff --git a/include/block/block.h b/include/block/block.h
index 3894edd..fb7d379 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -225,6 +225,13 @@  enum {
     BLK_PERM_GRAPH_MOD          = 0x10,
 
     BLK_PERM_ALL                = 0x1f,
+
+    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
+                                 | BLK_PERM_WRITE
+                                 | BLK_PERM_WRITE_UNCHANGED
+                                 | BLK_PERM_RESIZE,
+
+    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
 };
 
 char *bdrv_perm_names(uint64_t perm);