diff mbox series

[v8,4/7] block: introduce backup-top filter driver

Message ID 20190529154654.95870-5-vsementsov@virtuozzo.com
State New
Headers show
Series backup-top filter driver for backup | expand

Commit Message

Vladimir Sementsov-Ogievskiy May 29, 2019, 3:46 p.m. UTC
Backup-top filter does copy-before-write operation. It should be
inserted above active disk and has a target node for CBW, like the
following:

    +-------+
    | Guest |
    +-------+
        |r,w
        v
    +------------+  target   +---------------+
    | backup_top |---------->| target(qcow2) |
    +------------+   CBW     +---------------+
        |
backing |r,w
        v
    +-------------+
    | Active disk |
    +-------------+

The driver will be used in backup instead of write-notifiers.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 block/backup-top.h  |  64 +++++++++
 block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
 block/Makefile.objs |   2 +
 3 files changed, 388 insertions(+)
 create mode 100644 block/backup-top.h
 create mode 100644 block/backup-top.c

Comments

Max Reitz June 13, 2019, 3:57 p.m. UTC | #1
On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
> Backup-top filter does copy-before-write operation. It should be
> inserted above active disk and has a target node for CBW, like the
> following:
> 
>     +-------+
>     | Guest |
>     +-------+
>         |r,w
>         v
>     +------------+  target   +---------------+
>     | backup_top |---------->| target(qcow2) |
>     +------------+   CBW     +---------------+
>         |
> backing |r,w
>         v
>     +-------------+
>     | Active disk |
>     +-------------+
> 
> The driver will be used in backup instead of write-notifiers.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  block/backup-top.h  |  64 +++++++++
>  block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>  block/Makefile.objs |   2 +
>  3 files changed, 388 insertions(+)
>  create mode 100644 block/backup-top.h
>  create mode 100644 block/backup-top.c
> 
> diff --git a/block/backup-top.h b/block/backup-top.h
> new file mode 100644
> index 0000000000..788e18c358
> --- /dev/null
> +++ b/block/backup-top.h
> @@ -0,0 +1,64 @@
> +/*
> + * backup-top filter driver
> + *
> + * The driver performs Copy-Before-Write (CBW) operation: it is injected above
> + * some node, and before each write it copies _old_ data to the target node.
> + *
> + * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved.
> + *
> + * Author:
> + *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef BACKUP_TOP_H
> +#define BACKUP_TOP_H
> +
> +#include "qemu/osdep.h"
> +
> +#include "block/block_int.h"
> +
> +typedef void (*BackupTopProgressCallback)(uint64_t done, void *opaque);
> +typedef struct BDRVBackupTopState {
> +    HBitmap *copy_bitmap; /* what should be copied to @target on guest write. */
> +    BdrvChild *target;
> +
> +    BackupTopProgressCallback progress_cb;
> +    void *progress_opaque;
> +} BDRVBackupTopState;
> +
> +/*
> + * bdrv_backup_top_append
> + *
> + * Append backup_top filter node above @source node. @target node will receive
> + * the data backed up during CBE operations. New filter together with @target
> + * node are attached to @source aio context.
> + *
> + * The resulting filter node is implicit.

Why?  It’s just as easy for the caller to just make it implicit if it
should be.  (And usually the caller should decide that.)

> + *
> + * @copy_bitmap selects regions which needs CBW. Furthermore, backup_top will
> + * use exactly this bitmap, so it may be used to control backup_top behavior
> + * dynamically. Caller should not release @copy_bitmap during life-time of
> + * backup_top. Progress is tracked by calling @progress_cb function.
> + */
> +BlockDriverState *bdrv_backup_top_append(
> +        BlockDriverState *source, BlockDriverState *target,
> +        HBitmap *copy_bitmap, Error **errp);
> +void bdrv_backup_top_set_progress_callback(
> +        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
> +        void *progress_opaque);
> +void bdrv_backup_top_drop(BlockDriverState *bs);
> +
> +#endif /* BACKUP_TOP_H */
> diff --git a/block/backup-top.c b/block/backup-top.c
> new file mode 100644
> index 0000000000..1daa02f539
> --- /dev/null
> +++ b/block/backup-top.c
> @@ -0,0 +1,322 @@
> +/*
> + * backup-top filter driver
> + *
> + * The driver performs Copy-Before-Write (CBW) operation: it is injected above
> + * some node, and before each write it copies _old_ data to the target node.
> + *
> + * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved.
> + *
> + * Author:
> + *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +
> +#include "qemu/cutils.h"
> +#include "qapi/error.h"
> +#include "block/block_int.h"
> +#include "block/qdict.h"
> +
> +#include "block/backup-top.h"
> +
> +static coroutine_fn int backup_top_co_preadv(
> +        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
> +        QEMUIOVector *qiov, int flags)
> +{
> +    /*
> +     * Features to be implemented:
> +     * F1. COR. save read data to fleecing target for fast access
> +     *     (to reduce reads). This possibly may be done with use of copy-on-read
> +     *     filter, but we need an ability to make COR requests optional: for
> +     *     example, if target is a ram-cache, and if it is full now, we should
> +     *     skip doing COR request, as it is actually not necessary.
> +     *
> +     * F2. Feature for guest: read from fleecing target if data is in ram-cache
> +     *     and is unchanged
> +     */
> +
> +    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
> +}
> +
> +static coroutine_fn int backup_top_cbw(BlockDriverState *bs, uint64_t offset,
> +                                       uint64_t bytes)
> +{
> +    int ret = 0;
> +    BDRVBackupTopState *s = bs->opaque;
> +    uint64_t gran = 1UL << hbitmap_granularity(s->copy_bitmap);
> +    uint64_t end = QEMU_ALIGN_UP(offset + bytes, gran);
> +    uint64_t off = QEMU_ALIGN_DOWN(offset, gran), len;

The “, len” is a bit weirdly placed there, why not define it on a
separate line as "uint64_t len = end - off"?

> +    void *buf = qemu_blockalign(bs, end - off);
> +
> +    /*
> +     * Features to be implemented:
> +     * F3. parallelize copying loop
> +     * F4. detect zeroes ? or, otherwise, drop detect zeroes from backup code
> +     *     and just enable zeroes detecting on target
> +     * F5. use block_status ?
> +     * F6. don't copy clusters which are already cached by COR [see F1]
> +     * F7. if target is ram-cache and it is full, there should be a possibility
> +     *     to drop not necessary data (cached by COR [see F1]) to handle CBW
> +     *     fast.

Also “drop BDRV_REQ_SERIALISING from writes to s->target unless necessary”?

> +     */
> +
> +    len = end - off;
> +    while (hbitmap_next_dirty_area(s->copy_bitmap, &off, &len)) {
> +        hbitmap_reset(s->copy_bitmap, off, len);
> +
> +        ret = bdrv_co_pread(bs->backing, off, len, buf,
> +                            BDRV_REQ_NO_SERIALISING);
> +        if (ret < 0) {
> +            hbitmap_set(s->copy_bitmap, off, len);
> +            goto out;
> +        }
> +
> +        ret = bdrv_co_pwrite(s->target, off, len, buf, BDRV_REQ_SERIALISING);
> +        if (ret < 0) {
> +            hbitmap_set(s->copy_bitmap, off, len);
> +            goto out;
> +        }
> +
> +        if (s->progress_cb) {
> +            s->progress_cb(len, s->progress_opaque);
> +        }
> +        off += len;
> +        if (off >= end) {
> +            break;
> +        }
> +        len = end - off;
> +    }
> +
> +out:
> +    qemu_vfree(buf);
> +
> +    /*
> +     * F8. we fail guest request in case of error. We can alter it by
> +     * possibility to fail copying process instead, or retry several times, or
> +     * may be guest pause, etc.
> +     */
> +    return ret;
> +}
> +
> +static int coroutine_fn backup_top_co_pdiscard(BlockDriverState *bs,
> +                                               int64_t offset, int bytes)
> +{
> +    int ret = backup_top_cbw(bs, offset, bytes);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    /*
> +     * Features to be implemented:
> +     * F9. possibility of lazy discard: just defer the discard after fleecing
> +     *     completion. If write (or new discard) occurs to the same area, just
> +     *     drop deferred discard.
> +     */
> +
> +    return bdrv_co_pdiscard(bs->backing, offset, bytes);
> +}
> +
> +static int coroutine_fn backup_top_co_pwrite_zeroes(BlockDriverState *bs,
> +        int64_t offset, int bytes, BdrvRequestFlags flags)
> +{
> +    int ret = backup_top_cbw(bs, offset, bytes);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
> +}
> +
> +static coroutine_fn int backup_top_co_pwritev(BlockDriverState *bs,
> +                                              uint64_t offset,
> +                                              uint64_t bytes,
> +                                              QEMUIOVector *qiov, int flags)
> +{
> +    if (!(flags & BDRV_REQ_WRITE_UNCHANGED)) {
> +        int ret = backup_top_cbw(bs, offset, bytes);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> +
> +    return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
> +}
> +
> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
> +{
> +    if (!bs->backing) {
> +        return 0;
> +    }
> +
> +    return bdrv_co_flush(bs->backing->bs);

Should we flush the target, too?

> +}
> +
> +static void backup_top_refresh_filename(BlockDriverState *bs)
> +{
> +    if (bs->backing == NULL) {
> +        /*
> +         * we can be here after failed bdrv_attach_child in
> +         * bdrv_set_backing_hd
> +         */
> +        return;
> +    }
> +    bdrv_refresh_filename(bs->backing->bs);

bdrv_refresh_filename() should have done this already.

> +    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
> +            bs->backing->bs->filename);
> +}
> +
> +static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c,
> +                                  const BdrvChildRole *role,
> +                                  BlockReopenQueue *reopen_queue,
> +                                  uint64_t perm, uint64_t shared,
> +                                  uint64_t *nperm, uint64_t *nshared)
> +{
> +    /*
> +     * We have HBitmap in the state, its size is fixed, so we never allow
> +     * resize.
> +     */
> +    uint64_t rw = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> +                  BLK_PERM_WRITE;
> +
> +    bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
> +                              nperm, nshared);
> +
> +    *nperm = *nperm & rw;
> +    *nshared = *nshared & rw;

Somehow, that bdrv_filter_default_perm() call doesn’t make this function
easier for me.

It seems like this is basically just “*nperm = perm & rw” and
“*nshared = shared & rw”.

> +
> +    if (role == &child_file) {
> +        /*
> +         * Target child
> +         *
> +         * Share write to target (child_file), to not interfere
> +         * with guest writes to its disk which may be in target backing chain.
> +         */
> +        if (perm & BLK_PERM_WRITE) {
> +            *nshared = *nshared | BLK_PERM_WRITE;

Why not always share WRITE on the target?

> +        }
> +    } else {
> +        /* Source child */
> +        if (perm & BLK_PERM_WRITE) {

Or WRITE_UNCHANGED, no?

> +            *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
> +        }
> +        *nshared =
> +            *nshared & (BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED);

And here it isn’t “*nshared = shared & rw” but “rw & ~WRITE”.

I feel like this function would be simpler if you just set *nperm and
*nshared separately in the two branches of this condition, without
setting a “default” first.

> +    }
> +}
> +
> +BlockDriver bdrv_backup_top_filter = {
> +    .format_name = "backup-top",
> +    .instance_size = sizeof(BDRVBackupTopState),
> +
> +    .bdrv_co_preadv             = backup_top_co_preadv,
> +    .bdrv_co_pwritev            = backup_top_co_pwritev,
> +    .bdrv_co_pwrite_zeroes      = backup_top_co_pwrite_zeroes,
> +    .bdrv_co_pdiscard           = backup_top_co_pdiscard,
> +    .bdrv_co_flush              = backup_top_co_flush,
> +
> +    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
> +
> +    .bdrv_refresh_filename      = backup_top_refresh_filename,
> +
> +    .bdrv_child_perm            = backup_top_child_perm,
> +
> +    .is_filter = true,
> +};
> +
> +BlockDriverState *bdrv_backup_top_append(BlockDriverState *source,
> +                                         BlockDriverState *target,
> +                                         HBitmap *copy_bitmap,
> +                                         Error **errp)
> +{
> +    Error *local_err = NULL;
> +    BDRVBackupTopState *state;
> +    BlockDriverState *top = bdrv_new_open_driver(&bdrv_backup_top_filter,
> +                                                 NULL, BDRV_O_RDWR, errp);
> +
> +    if (!top) {
> +        return NULL;
> +    }
> +
> +    top->implicit = true;

As I said above, I think the caller should set this.

> +    top->total_sectors = source->total_sectors;
> +    top->bl.opt_mem_alignment = MAX(bdrv_opt_mem_align(source),
> +                                    bdrv_opt_mem_align(target));
> +    top->opaque = state = g_new0(BDRVBackupTopState, 1);
> +    state->copy_bitmap = copy_bitmap;
> +
> +    bdrv_ref(target);
> +    state->target = bdrv_attach_child(top, target, "target", &child_file, errp);
> +    if (!state->target) {
> +        bdrv_unref(target);
> +        bdrv_unref(top);
> +        return NULL;
> +    }
> +
> +    bdrv_set_aio_context(top, bdrv_get_aio_context(source));
> +    bdrv_set_aio_context(target, bdrv_get_aio_context(source));

I suppose these calls aren’t necessary anymore (compare d0ee0204f4009).
 (I’m not fully sure, though.  In any case, they would need to be
bdrv_try_set_aio_context() if they were still necessary.)

> +
> +    bdrv_drained_begin(source);
> +
> +    bdrv_ref(top);
> +    bdrv_append(top, source, &local_err);
> +    if (local_err) {
> +        error_prepend(&local_err, "Cannot append backup-top filter: ");
> +    }
> +
> +    bdrv_drained_end(source);
> +
> +    if (local_err) {
> +        bdrv_unref_child(top, state->target);
> +        bdrv_unref(top);
> +        error_propagate(errp, local_err);
> +        return NULL;
> +    }
> +
> +    return top;
> +}

I guess it would be nice if users could blockdev-add a backup-top
node to basically get a backup with sync=none.

(The bdrv_open implementation should then create a new bitmap and
initialize it to be fully set.)

But maybe it wouldn’t be so nice and I just have feverish dreams.

> +void bdrv_backup_top_set_progress_callback(
> +        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
> +        void *progress_opaque)
> +{
> +    BDRVBackupTopState *s = bs->opaque;
> +
> +    s->progress_cb = progress_cb;
> +    s->progress_opaque = progress_opaque;
> +}
> +
> +void bdrv_backup_top_drop(BlockDriverState *bs)
> +{
> +    BDRVBackupTopState *s = bs->opaque;
> +    AioContext *aio_context = bdrv_get_aio_context(bs);
> +
> +    aio_context_acquire(aio_context);
> +
> +    bdrv_drained_begin(bs);
> +
> +    bdrv_child_try_set_perm(bs->backing, 0, BLK_PERM_ALL, &error_abort);

Pre-existing in other jobs, but I think calling this function is
dangerous.  (Which is why I remove it in my “block: Ignore loosening
perm restrictions failures” series.)

> +    bdrv_replace_node(bs, backing_bs(bs), &error_abort);
> +    bdrv_set_backing_hd(bs, NULL, &error_abort);

I think some of this function should be in a .bdrv_close()
implementation, for example this bdrv_set_backing_hd() call.

> +    bdrv_drained_end(bs);
> +
> +    if (s->target) {
> +        bdrv_unref_child(bs, s->target);

And this.  Well, neither needs to be in a .bdrv_close() implementation,
actually, because bdrv_close() will just do it by itself.

I suppose you’re trying to remove the children so the node is no longer
usable after this point; but that isn’t quite right, then.  The filter
functions still refer to s->target and bs->backing, so you need to stop
them from doing anything then.

At this point, you might as well add a variable to BDRVBackupTopState
that says whether the filter is still supposed to be usable (like the
“stop” variable I added to mirror in “block/mirror: Fix child
permissions”).  If so, the permission function should signal 0/ALL for
the permissions on backing (and probably target), and all filter
functions always immediately return an error.

So I don’t think backing and target need to be removed here.  We can
wait for that until bdrv_close(), but we should ensure that the filter
really is unusable after this function has been called.

Max

> +    }
> +    bdrv_unref(bs);
> +
> +    aio_context_release(aio_context);
> +}
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index ae11605c9f..dfbdfe6ab4 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -40,6 +40,8 @@ block-obj-y += throttle.o copy-on-read.o
>  
>  block-obj-y += crypto.o
>  
> +block-obj-y += backup-top.o
> +
>  common-obj-y += stream.o
>  
>  nfs.o-libs         := $(LIBNFS_LIBS)
>
Vladimir Sementsov-Ogievskiy June 14, 2019, 9:04 a.m. UTC | #2
13.06.2019 18:57, Max Reitz wrote:
> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>> Backup-top filter does copy-before-write operation. It should be
>> inserted above active disk and has a target node for CBW, like the
>> following:
>>
>>      +-------+
>>      | Guest |
>>      +-------+
>>          |r,w
>>          v
>>      +------------+  target   +---------------+
>>      | backup_top |---------->| target(qcow2) |
>>      +------------+   CBW     +---------------+
>>          |
>> backing |r,w
>>          v
>>      +-------------+
>>      | Active disk |
>>      +-------------+
>>
>> The driver will be used in backup instead of write-notifiers.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   block/backup-top.h  |  64 +++++++++
>>   block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>   block/Makefile.objs |   2 +
>>   3 files changed, 388 insertions(+)
>>   create mode 100644 block/backup-top.h
>>   create mode 100644 block/backup-top.c
>>
>> diff --git a/block/backup-top.h b/block/backup-top.h
>> new file mode 100644
>> index 0000000000..788e18c358
>> --- /dev/null
>> +++ b/block/backup-top.h
>> @@ -0,0 +1,64 @@
>> +/*
>> + * backup-top filter driver
>> + *
>> + * The driver performs Copy-Before-Write (CBW) operation: it is injected above
>> + * some node, and before each write it copies _old_ data to the target node.
>> + *
>> + * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved.
>> + *
>> + * Author:
>> + *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef BACKUP_TOP_H
>> +#define BACKUP_TOP_H
>> +
>> +#include "qemu/osdep.h"
>> +
>> +#include "block/block_int.h"
>> +
>> +typedef void (*BackupTopProgressCallback)(uint64_t done, void *opaque);
>> +typedef struct BDRVBackupTopState {
>> +    HBitmap *copy_bitmap; /* what should be copied to @target on guest write. */
>> +    BdrvChild *target;
>> +
>> +    BackupTopProgressCallback progress_cb;
>> +    void *progress_opaque;
>> +} BDRVBackupTopState;
>> +
>> +/*
>> + * bdrv_backup_top_append
>> + *
>> + * Append backup_top filter node above @source node. @target node will receive
>> + * the data backed up during CBE operations. New filter together with @target
>> + * node are attached to @source aio context.
>> + *
>> + * The resulting filter node is implicit.
> 
> Why?  It’s just as easy for the caller to just make it implicit if it
> should be.  (And usually the caller should decide that.)

Personally, I don't know what the reasons are for filters to be implicit or not;
I just made it like other job-filters.. I can move making-implicit to the caller
or drop it at all (if it will work).

> 
>> + *
>> + * @copy_bitmap selects regions which needs CBW. Furthermore, backup_top will
>> + * use exactly this bitmap, so it may be used to control backup_top behavior
>> + * dynamically. Caller should not release @copy_bitmap during life-time of
>> + * backup_top. Progress is tracked by calling @progress_cb function.
>> + */
>> +BlockDriverState *bdrv_backup_top_append(
>> +        BlockDriverState *source, BlockDriverState *target,
>> +        HBitmap *copy_bitmap, Error **errp);
>> +void bdrv_backup_top_set_progress_callback(
>> +        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
>> +        void *progress_opaque);
>> +void bdrv_backup_top_drop(BlockDriverState *bs);
>> +
>> +#endif /* BACKUP_TOP_H */
>> diff --git a/block/backup-top.c b/block/backup-top.c
>> new file mode 100644
>> index 0000000000..1daa02f539
>> --- /dev/null
>> +++ b/block/backup-top.c
>> @@ -0,0 +1,322 @@
>> +/*
>> + * backup-top filter driver
>> + *
>> + * The driver performs Copy-Before-Write (CBW) operation: it is injected above
>> + * some node, and before each write it copies _old_ data to the target node.
>> + *
>> + * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved.
>> + *
>> + * Author:
>> + *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +
>> +#include "qemu/cutils.h"
>> +#include "qapi/error.h"
>> +#include "block/block_int.h"
>> +#include "block/qdict.h"
>> +
>> +#include "block/backup-top.h"
>> +
>> +static coroutine_fn int backup_top_co_preadv(
>> +        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
>> +        QEMUIOVector *qiov, int flags)
>> +{
>> +    /*
>> +     * Features to be implemented:
>> +     * F1. COR. save read data to fleecing target for fast access
>> +     *     (to reduce reads). This possibly may be done with use of copy-on-read
>> +     *     filter, but we need an ability to make COR requests optional: for
>> +     *     example, if target is a ram-cache, and if it is full now, we should
>> +     *     skip doing COR request, as it is actually not necessary.
>> +     *
>> +     * F2. Feature for guest: read from fleecing target if data is in ram-cache
>> +     *     and is unchanged
>> +     */
>> +
>> +    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
>> +}
>> +
>> +static coroutine_fn int backup_top_cbw(BlockDriverState *bs, uint64_t offset,
>> +                                       uint64_t bytes)
>> +{
>> +    int ret = 0;
>> +    BDRVBackupTopState *s = bs->opaque;
>> +    uint64_t gran = 1UL << hbitmap_granularity(s->copy_bitmap);
>> +    uint64_t end = QEMU_ALIGN_UP(offset + bytes, gran);
>> +    uint64_t off = QEMU_ALIGN_DOWN(offset, gran), len;
> 
> The “, len” is a bit weirdly placed there, why not define it on a
> separate line as "uint64_t len = end - off"?
> 
>> +    void *buf = qemu_blockalign(bs, end - off);
>> +
>> +    /*
>> +     * Features to be implemented:
>> +     * F3. parallelize copying loop
>> +     * F4. detect zeroes ? or, otherwise, drop detect zeroes from backup code
>> +     *     and just enable zeroes detecting on target
>> +     * F5. use block_status ?
>> +     * F6. don't copy clusters which are already cached by COR [see F1]
>> +     * F7. if target is ram-cache and it is full, there should be a possibility
>> +     *     to drop not necessary data (cached by COR [see F1]) to handle CBW
>> +     *     fast.
> 
> Also “drop BDRV_REQ_SERIALISING from writes to s->target unless necessary”?

Hmm, yes. It may be a filter parameter: whether to serialize writes or not.

> 
>> +     */
>> +
>> +    len = end - off;
>> +    while (hbitmap_next_dirty_area(s->copy_bitmap, &off, &len)) {
>> +        hbitmap_reset(s->copy_bitmap, off, len);
>> +
>> +        ret = bdrv_co_pread(bs->backing, off, len, buf,
>> +                            BDRV_REQ_NO_SERIALISING);
>> +        if (ret < 0) {
>> +            hbitmap_set(s->copy_bitmap, off, len);
>> +            goto out;
>> +        }
>> +
>> +        ret = bdrv_co_pwrite(s->target, off, len, buf, BDRV_REQ_SERIALISING);
>> +        if (ret < 0) {
>> +            hbitmap_set(s->copy_bitmap, off, len);
>> +            goto out;
>> +        }
>> +
>> +        if (s->progress_cb) {
>> +            s->progress_cb(len, s->progress_opaque);
>> +        }
>> +        off += len;
>> +        if (off >= end) {
>> +            break;
>> +        }
>> +        len = end - off;
>> +    }
>> +
>> +out:
>> +    qemu_vfree(buf);
>> +
>> +    /*
>> +     * F8. we fail guest request in case of error. We can alter it by
>> +     * possibility to fail copying process instead, or retry several times, or
>> +     * may be guest pause, etc.
>> +     */
>> +    return ret;
>> +}
>> +
>> +static int coroutine_fn backup_top_co_pdiscard(BlockDriverState *bs,
>> +                                               int64_t offset, int bytes)
>> +{
>> +    int ret = backup_top_cbw(bs, offset, bytes);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    /*
>> +     * Features to be implemented:
>> +     * F9. possibility of lazy discard: just defer the discard after fleecing
>> +     *     completion. If write (or new discard) occurs to the same area, just
>> +     *     drop deferred discard.
>> +     */
>> +
>> +    return bdrv_co_pdiscard(bs->backing, offset, bytes);
>> +}
>> +
>> +static int coroutine_fn backup_top_co_pwrite_zeroes(BlockDriverState *bs,
>> +        int64_t offset, int bytes, BdrvRequestFlags flags)
>> +{
>> +    int ret = backup_top_cbw(bs, offset, bytes);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
>> +}
>> +
>> +static coroutine_fn int backup_top_co_pwritev(BlockDriverState *bs,
>> +                                              uint64_t offset,
>> +                                              uint64_t bytes,
>> +                                              QEMUIOVector *qiov, int flags)
>> +{
>> +    if (!(flags & BDRV_REQ_WRITE_UNCHANGED)) {
>> +        int ret = backup_top_cbw(bs, offset, bytes);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
>> +}
>> +
>> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
>> +{
>> +    if (!bs->backing) {
>> +        return 0;
>> +    }
>> +
>> +    return bdrv_co_flush(bs->backing->bs);
> 
> Should we flush the target, too?

Hm, you've asked this already, on the previous version :) Backup doesn't do it,
so I just keep the old behavior. And what is the reason to flush the backup
target on any guest flush?

> 
>> +}
>> +
>> +static void backup_top_refresh_filename(BlockDriverState *bs)
>> +{
>> +    if (bs->backing == NULL) {
>> +        /*
>> +         * we can be here after failed bdrv_attach_child in
>> +         * bdrv_set_backing_hd
>> +         */
>> +        return;
>> +    }
>> +    bdrv_refresh_filename(bs->backing->bs);
> 
> bdrv_refresh_filename() should have done this already.
> 
>> +    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
>> +            bs->backing->bs->filename);
>> +}
>> +
>> +static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c,
>> +                                  const BdrvChildRole *role,
>> +                                  BlockReopenQueue *reopen_queue,
>> +                                  uint64_t perm, uint64_t shared,
>> +                                  uint64_t *nperm, uint64_t *nshared)
>> +{
>> +    /*
>> +     * We have HBitmap in the state, its size is fixed, so we never allow
>> +     * resize.
>> +     */
>> +    uint64_t rw = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
>> +                  BLK_PERM_WRITE;
>> +
>> +    bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
>> +                              nperm, nshared);
>> +
>> +    *nperm = *nperm & rw;
>> +    *nshared = *nshared & rw;
> 
> Somehow, that bdrv_filter_default_perm() call doesn’t make this function
> easier for me.
> 
> It seems like this is basically just “*nperm = perm & rw” and
> “*nshared = shared & rw”.
> 
>> +
>> +    if (role == &child_file) {
>> +        /*
>> +         * Target child
>> +         *
>> +         * Share write to target (child_file), to not interfere
>> +         * with guest writes to its disk which may be in target backing chain.
>> +         */
>> +        if (perm & BLK_PERM_WRITE) {
>> +            *nshared = *nshared | BLK_PERM_WRITE;
> 
> Why not always share WRITE on the target?

Hmm, it's a bad thing to share writes on the target, so I'm trying to reduce
the number of cases when we have to share it.

> 
>> +        }
>> +    } else {
>> +        /* Source child */
>> +        if (perm & BLK_PERM_WRITE) {
> 
> Or WRITE_UNCHANGED, no?

Why? We don't need to do CBW for an unchanged write.

> 
>> +            *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
>> +        }
>> +        *nshared =
>> +            *nshared & (BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED);
> 
> And here it isn’t “*nshared = shared & rw” but “rw & ~WRITE”.
> 
> I feel like this function would be simpler if you just set *nperm and
> *nshared separately in the two branches of this condition, without
> setting a “default” first.

Ok, I'll try

> 
>> +    }
>> +}
>> +
>> +BlockDriver bdrv_backup_top_filter = {
>> +    .format_name = "backup-top",
>> +    .instance_size = sizeof(BDRVBackupTopState),
>> +
>> +    .bdrv_co_preadv             = backup_top_co_preadv,
>> +    .bdrv_co_pwritev            = backup_top_co_pwritev,
>> +    .bdrv_co_pwrite_zeroes      = backup_top_co_pwrite_zeroes,
>> +    .bdrv_co_pdiscard           = backup_top_co_pdiscard,
>> +    .bdrv_co_flush              = backup_top_co_flush,
>> +
>> +    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
>> +
>> +    .bdrv_refresh_filename      = backup_top_refresh_filename,
>> +
>> +    .bdrv_child_perm            = backup_top_child_perm,
>> +
>> +    .is_filter = true,
>> +};
>> +
>> +BlockDriverState *bdrv_backup_top_append(BlockDriverState *source,
>> +                                         BlockDriverState *target,
>> +                                         HBitmap *copy_bitmap,
>> +                                         Error **errp)
>> +{
>> +    Error *local_err = NULL;
>> +    BDRVBackupTopState *state;
>> +    BlockDriverState *top = bdrv_new_open_driver(&bdrv_backup_top_filter,
>> +                                                 NULL, BDRV_O_RDWR, errp);
>> +
>> +    if (!top) {
>> +        return NULL;
>> +    }
>> +
>> +    top->implicit = true;
> 
> As I said above, I think the caller should set this.
> 
>> +    top->total_sectors = source->total_sectors;
>> +    top->bl.opt_mem_alignment = MAX(bdrv_opt_mem_align(source),
>> +                                    bdrv_opt_mem_align(target));
>> +    top->opaque = state = g_new0(BDRVBackupTopState, 1);
>> +    state->copy_bitmap = copy_bitmap;
>> +
>> +    bdrv_ref(target);
>> +    state->target = bdrv_attach_child(top, target, "target", &child_file, errp);
>> +    if (!state->target) {
>> +        bdrv_unref(target);
>> +        bdrv_unref(top);
>> +        return NULL;
>> +    }
>> +
>> +    bdrv_set_aio_context(top, bdrv_get_aio_context(source));
>> +    bdrv_set_aio_context(target, bdrv_get_aio_context(source));
> 
> I suppose these calls aren’t necessary anymore (compare d0ee0204f4009).

Hmm, see it, agree.

>   (I’m not fully sure, though.  In any case, they would need to be
> bdrv_try_set_aio_context() if they were still necessary.)
> 
>> +
>> +    bdrv_drained_begin(source);
>> +
>> +    bdrv_ref(top);
>> +    bdrv_append(top, source, &local_err);
>> +    if (local_err) {
>> +        error_prepend(&local_err, "Cannot append backup-top filter: ");
>> +    }
>> +
>> +    bdrv_drained_end(source);
>> +
>> +    if (local_err) {
>> +        bdrv_unref_child(top, state->target);
>> +        bdrv_unref(top);
>> +        error_propagate(errp, local_err);
>> +        return NULL;
>> +    }
>> +
>> +    return top;
>> +}
> 
> I guess it would be nice if it users could blockdev-add a backup-top
> node to basically get a backup with sync=none.
> 
> (The bdrv_open implementation should then create a new bitmap and
> initialize it to be fully set.)
> 
> But maybe it wouldn’t be so nice and I just have feverish dreams.

When the series began, I was trying to make exactly this - a user-available filter,
which may be used separately, but you were against it)

Maybe not totally against, but I decided not to argue for long, and instead made the
filter implicit and dropped the public API (like the mirror and commit filters), but placed it
in a separate file (no one will argue against putting a large enough and complete
new feature, represented by an object, into a separate file :). And this actually
makes it easy to publish this filter at some point. And now I think it was a
good decision anyway, as we postponed arguing about the API and about shared dirty
bitmaps.

So publishing the filter is future step.

> 
>> +void bdrv_backup_top_set_progress_callback(
>> +        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
>> +        void *progress_opaque)
>> +{
>> +    BDRVBackupTopState *s = bs->opaque;
>> +
>> +    s->progress_cb = progress_cb;
>> +    s->progress_opaque = progress_opaque;
>> +}
>> +
>> +void bdrv_backup_top_drop(BlockDriverState *bs)
>> +{
>> +    BDRVBackupTopState *s = bs->opaque;
>> +    AioContext *aio_context = bdrv_get_aio_context(bs);
>> +
>> +    aio_context_acquire(aio_context);
>> +
>> +    bdrv_drained_begin(bs);
>> +
>> +    bdrv_child_try_set_perm(bs->backing, 0, BLK_PERM_ALL, &error_abort);
> 
> Pre-existing in other jobs, but I think calling this function is
> dangerous.  (Which is why I remove it in my “block: Ignore loosening
> perm restrictions failures” series.)

Hmm, good thing.. Should I rebase on it?

> 
>> +    bdrv_replace_node(bs, backing_bs(bs), &error_abort);
>> +    bdrv_set_backing_hd(bs, NULL, &error_abort);
> 
> I think some of this function should be in a .bdrv_close()
> implementation, for example this bdrv_set_backing_hd() call.

Why? We don't have .bdrv_open, so why have .bdrv_close? I think that when
we publish this filter, most of _add() and _drop() will be refactored into
open() and close(). Is there a real reason to implement .close() now?

> 
>> +    bdrv_drained_end(bs);
>> +
>> +    if (s->target) {
>> +        bdrv_unref_child(bs, s->target);
> 
> And this.  Well, neither needs to be in a .bdrv_close() implementation,
> actually, because bdrv_close() will just do it by itself.
> 
> I suppose you’re trying to remove the children so the node is no longer
> usable after this point; but that isn’t quite right, then.  The filter
> functions still refer to s->target and bs->backing, so you need to stop
> them from doing anything then.
> 
> At this point, you might as well add a variable to BDRVBackupTopState
> that says whether the filter is still supposed to be usable (like the
> “stop” variable I added to mirror in “block/mirror: Fix child
> permissions”).  If so, the permission function should signal 0/ALL for
> the permissions on backing (and probably target), and all filter
> functions always immediately return an error.
> 
> So I don’t think backing and target need to be removed here.  We can
> wait for that until bdrv_close(), but we should ensure that the filter
> really is unusable after this function has been called.

Ok, thank you for reviewing!

> 
> Max
> 
>> +    }
>> +    bdrv_unref(bs);
>> +
>> +    aio_context_release(aio_context);
>> +}
>> diff --git a/block/Makefile.objs b/block/Makefile.objs
>> index ae11605c9f..dfbdfe6ab4 100644
>> --- a/block/Makefile.objs
>> +++ b/block/Makefile.objs
>> @@ -40,6 +40,8 @@ block-obj-y += throttle.o copy-on-read.o
>>   
>>   block-obj-y += crypto.o
>>   
>> +block-obj-y += backup-top.o
>> +
>>   common-obj-y += stream.o
>>   
>>   nfs.o-libs         := $(LIBNFS_LIBS)
>>
> 
>
Max Reitz June 14, 2019, 12:57 p.m. UTC | #3
On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
> 13.06.2019 18:57, Max Reitz wrote:
>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>> Backup-top filter does copy-before-write operation. It should be
>>> inserted above active disk and has a target node for CBW, like the
>>> following:
>>>
>>>      +-------+
>>>      | Guest |
>>>      +-------+
>>>          |r,w
>>>          v
>>>      +------------+  target   +---------------+
>>>      | backup_top |---------->| target(qcow2) |
>>>      +------------+   CBW     +---------------+
>>>          |
>>> backing |r,w
>>>          v
>>>      +-------------+
>>>      | Active disk |
>>>      +-------------+
>>>
>>> The driver will be used in backup instead of write-notifiers.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>> ---
>>>   block/backup-top.h  |  64 +++++++++
>>>   block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>   block/Makefile.objs |   2 +
>>>   3 files changed, 388 insertions(+)
>>>   create mode 100644 block/backup-top.h
>>>   create mode 100644 block/backup-top.c
>>>
>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>> new file mode 100644
>>> index 0000000000..788e18c358
>>> --- /dev/null
>>> +++ b/block/backup-top.h

[...]

>>> +/*
>>> + * bdrv_backup_top_append
>>> + *
>>> + * Append backup_top filter node above @source node. @target node will receive
>>> + * the data backed up during CBE operations. New filter together with @target
>>> + * node are attached to @source aio context.
>>> + *
>>> + * The resulting filter node is implicit.
>>
>> Why?  It’s just as easy for the caller to just make it implicit if it
>> should be.  (And usually the caller should decide that.)
> 
> Personally, I don't know what are the reasons for filters to bi implicit or not,
> I just made it like other job-filters.. I can move making-implicit to the caller
> or drop it at all (if it will work).

Nodes are implicit if they haven’t been added consciously by the user.
A node added by a block job can be non-implicit, too, as mirror shows;
If the user specifies the filter-node-name option, they will know about
the node, thus it is no longer implicit.

If the user doesn’t know about the node (they didn’t give the
filter-node-name option), the node is implicit.

[...]

>>> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
>>> +{
>>> +    if (!bs->backing) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    return bdrv_co_flush(bs->backing->bs);
>>
>> Should we flush the target, too?
> 
> Hm, you've asked it already, on previous version :)

I wasn’t sure...

> Backup don't do it,
> so I just keep old behavior. And what is the reason to flush backup target
> on any guest flush?

Hm, well, what’s the reason not to do it?
Also, there are not only guest flushes.  bdrv_flush_all() exists, which
is called when the guest is stopped.  So who is going to flush the
target if not its parent?

[...]

>>> +
>>> +    if (role == &child_file) {
>>> +        /*
>>> +         * Target child
>>> +         *
>>> +         * Share write to target (child_file), to not interfere
>>> +         * with guest writes to its disk which may be in target backing chain.
>>> +         */
>>> +        if (perm & BLK_PERM_WRITE) {
>>> +            *nshared = *nshared | BLK_PERM_WRITE;
>>
>> Why not always share WRITE on the target?
> 
> Hmm, it's a bad thing to share writes on target, so I'm trying to reduce number of
> cases when we have share it.

Is it?  First of all, this filter doesn’t care.  It doesn’t even read
from the target (related note: we probably don’t need CONSISTENT_READ on
the target).

Second, there is generally going to be a parent on backup-top that has
the WRITE permission taken.  So this does not really effectively reduce
that number of cases.

>>> +        }
>>> +    } else {
>>> +        /* Source child */
>>> +        if (perm & BLK_PERM_WRITE) {
>>
>> Or WRITE_UNCHANGED, no?
> 
> Why? We don't need doing CBW for unchanged write.

But we will do it still, won’t we?

(If an unchanging write comes in, this driver will handle it just like a
normal write, will it not?)

[...]

>>> +
>>> +BlockDriverState *bdrv_backup_top_append(BlockDriverState *source,
>>> +                                         BlockDriverState *target,
>>> +                                         HBitmap *copy_bitmap,
>>> +                                         Error **errp)
>>> +{

[...]

>>> +}
>>
>> I guess it would be nice if it users could blockdev-add a backup-top
>> node to basically get a backup with sync=none.
>>
>> (The bdrv_open implementation should then create a new bitmap and
>> initialize it to be fully set.)
>>
>> But maybe it wouldn’t be so nice and I just have feverish dreams.
> 
> When series begun, I was trying to make exactly this - user-available filter,
> which may be used in separate, but you was against)

Past me is stupid.

> Maybe, not totally against, but I decided not to argue long, and instead make
> filter implicit and drop public api (like mirror and commit filters), but place it
> in a separate file (no one will argue against putting large enough and complete
> new feature, represented by object into a separate file :). And this actually
> makes it easy to publish this filter at some moment. And now I think it was
> good decision anyway, as we postponed arguing around API and around shared dirty
> bitmaps.
> 
> So publishing the filter is future step.

OK, sure.

>>> +void bdrv_backup_top_set_progress_callback(
>>> +        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
>>> +        void *progress_opaque)
>>> +{
>>> +    BDRVBackupTopState *s = bs->opaque;
>>> +
>>> +    s->progress_cb = progress_cb;
>>> +    s->progress_opaque = progress_opaque;
>>> +}
>>> +
>>> +void bdrv_backup_top_drop(BlockDriverState *bs)
>>> +{
>>> +    BDRVBackupTopState *s = bs->opaque;
>>> +    AioContext *aio_context = bdrv_get_aio_context(bs);
>>> +
>>> +    aio_context_acquire(aio_context);
>>> +
>>> +    bdrv_drained_begin(bs);
>>> +
>>> +    bdrv_child_try_set_perm(bs->backing, 0, BLK_PERM_ALL, &error_abort);
>>
>> Pre-existing in other jobs, but I think calling this function is
>> dangerous.  (Which is why I remove it in my “block: Ignore loosening
>> perm restrictions failures” series.)
> 
> Hmm, good thing.. Should I rebase on it?

It would help me at least.

>>> +    bdrv_replace_node(bs, backing_bs(bs), &error_abort);
>>> +    bdrv_set_backing_hd(bs, NULL, &error_abort);
>>
>> I think some of this function should be in a .bdrv_close()
>> implementation, for example this bdrv_set_backing_hd() call.
> 
> Why? We don't have .bdrv_open, so why to have .bdrv_close? I think, when
> we publish this filter most of _add() and _drop() will be refactored to
> open() and close(). Is there a real reason to implement .close() now?

Not really if it isn’t a usable block driver yet, no.

Max
Vladimir Sementsov-Ogievskiy June 14, 2019, 4:22 p.m. UTC | #4
14.06.2019 15:57, Max Reitz wrote:
> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
>> 13.06.2019 18:57, Max Reitz wrote:
>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>>> Backup-top filter does copy-before-write operation. It should be
>>>> inserted above active disk and has a target node for CBW, like the
>>>> following:
>>>>
>>>>       +-------+
>>>>       | Guest |
>>>>       +-------+
>>>>           |r,w
>>>>           v
>>>>       +------------+  target   +---------------+
>>>>       | backup_top |---------->| target(qcow2) |
>>>>       +------------+   CBW     +---------------+
>>>>           |
>>>> backing |r,w
>>>>           v
>>>>       +-------------+
>>>>       | Active disk |
>>>>       +-------------+
>>>>
>>>> The driver will be used in backup instead of write-notifiers.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>> ---
>>>>    block/backup-top.h  |  64 +++++++++
>>>>    block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>>    block/Makefile.objs |   2 +
>>>>    3 files changed, 388 insertions(+)
>>>>    create mode 100644 block/backup-top.h
>>>>    create mode 100644 block/backup-top.c
>>>>
>>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>>> new file mode 100644
>>>> index 0000000000..788e18c358
>>>> --- /dev/null
>>>> +++ b/block/backup-top.h
> 
> [...]
> 
>>>> +/*
>>>> + * bdrv_backup_top_append
>>>> + *
>>>> + * Append backup_top filter node above @source node. @target node will receive
>>>> + * the data backed up during CBE operations. New filter together with @target
>>>> + * node are attached to @source aio context.
>>>> + *
>>>> + * The resulting filter node is implicit.
>>>
>>> Why?  It’s just as easy for the caller to just make it implicit if it
>>> should be.  (And usually the caller should decide that.)
>>
>> Personally, I don't know what are the reasons for filters to bi implicit or not,
>> I just made it like other job-filters.. I can move making-implicit to the caller
>> or drop it at all (if it will work).
> 
> Nodes are implicit if they haven’t been added consciously by the user.
> A node added by a block job can be non-implicit, too, as mirror shows;
> If the user specifies the filter-node-name option, they will know about
> the node, thus it is no longer implicit.
> 
> If the user doesn’t know about the node (they didn’t give the
> filter-node-name option), the node is implicit.
> 

Ok, I understand it. But it doesn't explain why it should be implicit.
Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
from query-named-block-nodes (the only interface to explore the whole graph for now)
anyway. And we can't completely hide the side effects of an additional node in the graph.

So, is there any real benefit to supporting the separation into implicit and explicit filters?
It seems to me that it only complicates things...
In other words, what will break if we make all filters explicit?

> [...]
> 
>>>> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
>>>> +{
>>>> +    if (!bs->backing) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    return bdrv_co_flush(bs->backing->bs);
>>>
>>> Should we flush the target, too?
>>
>> Hm, you've asked it already, on previous version :)
> 
> I wasn’t sure...
> 
>> Backup don't do it,
>> so I just keep old behavior. And what is the reason to flush backup target
>> on any guest flush?
> 
> Hm, well, what’s the reason not to do it?

guest flushes will be slowed down?

> Also, there are not only guest flushes.  bdrv_flush_all() exists, which
> is called when the guest is stopped.  So who is going to flush the
> target if not its parent?
> 
> [...]

Backup job? But the filter should not rely on it..

But should the filter really do that job, or is it here only to do CBW? Maybe the target
must have another parent which will take care of flushing.

Ok, I think I'll flush the target here too for safety, and leave a comment that something
smarter would be better.
(or, if we decide to flush all children by default in generic code, I'll drop this handler)

> 
>>>> +
>>>> +    if (role == &child_file) {
>>>> +        /*
>>>> +         * Target child
>>>> +         *
>>>> +         * Share write to target (child_file), to not interfere
>>>> +         * with guest writes to its disk which may be in target backing chain.
>>>> +         */
>>>> +        if (perm & BLK_PERM_WRITE) {
>>>> +            *nshared = *nshared | BLK_PERM_WRITE;
>>>
>>> Why not always share WRITE on the target?
>>
>> Hmm, it's a bad thing to share writes on target, so I'm trying to reduce number of
>> cases when we have share it.
> 
> Is it?  First of all, this filter doesn’t care.  It doesn’t even read
> from the target (related note: we probably don’t need CONSISTENT_READ on
> the target).
> 
> Second, there is generally going to be a parent on backup-top that has
> the WRITE permission taken.  So this does not really effectively reduce
> that number of cases.

Ok.

> 
>>>> +        }
>>>> +    } else {
>>>> +        /* Source child */
>>>> +        if (perm & BLK_PERM_WRITE) {
>>>
>>> Or WRITE_UNCHANGED, no?
>>
>> Why? We don't need doing CBW for unchanged write.
> 
> But we will do it still, won’t we?
> 
> (If an unchanging write comes in, this driver will handle it just like a
> normal write, will it not?)

No, it will not, I check this flag in backup_top_co_pwritev

> 
> [...]
> 
>>>> +
>>>> +BlockDriverState *bdrv_backup_top_append(BlockDriverState *source,
>>>> +                                         BlockDriverState *target,
>>>> +                                         HBitmap *copy_bitmap,
>>>> +                                         Error **errp)
>>>> +{
> 
> [...]
> 
>>>> +}
>>>
>>> I guess it would be nice if it users could blockdev-add a backup-top
>>> node to basically get a backup with sync=none.
>>>
>>> (The bdrv_open implementation should then create a new bitmap and
>>> initialize it to be fully set.)
>>>
>>> But maybe it wouldn’t be so nice and I just have feverish dreams.
>>
>> When series begun, I was trying to make exactly this - user-available filter,
>> which may be used in separate, but you was against)
> 
> Past me is stupid.
> 
>> Maybe, not totally against, but I decided not to argue long, and instead make
>> filter implicit and drop public api (like mirror and commit filters), but place it
>> in a separate file (no one will argue against putting large enough and complete
>> new feature, represented by object into a separate file :). And this actually
>> makes it easy to publish this filter at some moment. And now I think it was
>> good decision anyway, as we postponed arguing around API and around shared dirty
>> bitmaps.
>>
>> So publishing the filter is future step.
> 
> OK, sure.
> 
>>>> +void bdrv_backup_top_set_progress_callback(
>>>> +        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
>>>> +        void *progress_opaque)
>>>> +{
>>>> +    BDRVBackupTopState *s = bs->opaque;
>>>> +
>>>> +    s->progress_cb = progress_cb;
>>>> +    s->progress_opaque = progress_opaque;
>>>> +}
>>>> +
>>>> +void bdrv_backup_top_drop(BlockDriverState *bs)
>>>> +{
>>>> +    BDRVBackupTopState *s = bs->opaque;
>>>> +    AioContext *aio_context = bdrv_get_aio_context(bs);
>>>> +
>>>> +    aio_context_acquire(aio_context);
>>>> +
>>>> +    bdrv_drained_begin(bs);
>>>> +
>>>> +    bdrv_child_try_set_perm(bs->backing, 0, BLK_PERM_ALL, &error_abort);
>>>
>>> Pre-existing in other jobs, but I think calling this function is
>>> dangerous.  (Which is why I remove it in my “block: Ignore loosening
>>> perm restrictions failures” series.)
>>
>> Hmm, good thing.. Should I rebase on it?
> 
> It would help me at least.

And now it's already staged, so I should.

> 
>>>> +    bdrv_replace_node(bs, backing_bs(bs), &error_abort);
>>>> +    bdrv_set_backing_hd(bs, NULL, &error_abort);
>>>
>>> I think some of this function should be in a .bdrv_close()
>>> implementation, for example this bdrv_set_backing_hd() call.
>>
>> Why? We don't have .bdrv_open, so why to have .bdrv_close? I think, when
>> we publish this filter most of _add() and _drop() will be refactored to
>> open() and close(). Is there a real reason to implement .close() now?
> 
> Not really if it isn’t a usable block driver yet, no.
> 
> Max
>
Max Reitz June 14, 2019, 8:03 p.m. UTC | #5
On 14.06.19 18:22, Vladimir Sementsov-Ogievskiy wrote:
> 14.06.2019 15:57, Max Reitz wrote:
>> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
>>> 13.06.2019 18:57, Max Reitz wrote:
>>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>>>> Backup-top filter does copy-before-write operation. It should be
>>>>> inserted above active disk and has a target node for CBW, like the
>>>>> following:
>>>>>
>>>>>       +-------+
>>>>>       | Guest |
>>>>>       +-------+
>>>>>           |r,w
>>>>>           v
>>>>>       +------------+  target   +---------------+
>>>>>       | backup_top |---------->| target(qcow2) |
>>>>>       +------------+   CBW     +---------------+
>>>>>           |
>>>>> backing |r,w
>>>>>           v
>>>>>       +-------------+
>>>>>       | Active disk |
>>>>>       +-------------+
>>>>>
>>>>> The driver will be used in backup instead of write-notifiers.
>>>>>
>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>> ---
>>>>>    block/backup-top.h  |  64 +++++++++
>>>>>    block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>    block/Makefile.objs |   2 +
>>>>>    3 files changed, 388 insertions(+)
>>>>>    create mode 100644 block/backup-top.h
>>>>>    create mode 100644 block/backup-top.c
>>>>>
>>>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>>>> new file mode 100644
>>>>> index 0000000000..788e18c358
>>>>> --- /dev/null
>>>>> +++ b/block/backup-top.h
>>
>> [...]
>>
>>>>> +/*
>>>>> + * bdrv_backup_top_append
>>>>> + *
>>>>> + * Append backup_top filter node above @source node. @target node will receive
>>>>> + * the data backed up during CBE operations. New filter together with @target
>>>>> + * node are attached to @source aio context.
>>>>> + *
>>>>> + * The resulting filter node is implicit.
>>>>
>>>> Why?  It’s just as easy for the caller to just make it implicit if it
>>>> should be.  (And usually the caller should decide that.)
>>>
>>> Personally, I don't know what are the reasons for filters to bi implicit or not,
>>> I just made it like other job-filters.. I can move making-implicit to the caller
>>> or drop it at all (if it will work).
>>
>> Nodes are implicit if they haven’t been added consciously by the user.
>> A node added by a block job can be non-implicit, too, as mirror shows;
>> If the user specifies the filter-node-name option, they will know about
>> the node, thus it is no longer implicit.
>>
>> If the user doesn’t know about the node (they didn’t give the
>> filter-node-name option), the node is implicit.
>>
> 
> Ok, I understand it. But it doesn't show, why it should be implicit?
> Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
> from query-named-block-nodes (the only interface to explore the whole graph for now)
> anyway. And we can't absolutely hide side effects of additional node in the graph.

Well, we try, at least.  At least we hide them from query-block.

> So, is there any real benefit of supporting separation into implicit and explicit filters?
> It seems for me that it only complicates things...
> In other words, what will break if we make all filters explicit?

The theory is that qemu may decide to add nodes at any point, but at
least when managing chains etc., they may not be visible to the user.  I
don’t think we can get rid of them so easily.

One example that isn’t implemented yet is copy-on-read.  In theory,
specifying copy-on-read=on for -drive should create an implicit COR node
on top.  The user shouldn’t see that node when inspecting the drive or
when issuing block jobs on it, etc.  And this node has to stay there
when the user does e.g. an active commit somewhere down the chain.

That sounds like a horrible ordeal to implement, so it hasn’t been done
yet.  Maybe it never will.  It isn’t that bad for the job filters,
because they generally freeze the block graph, so there is no problem
with potential modifications.

All in all I do think having implicit nodes makes sense.  Maybe not so
much now, but in the future (if someone implements converting -drive COR
and throttle options to implicit nodes...).

>> [...]
>>
>>>>> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
>>>>> +{
>>>>> +    if (!bs->backing) {
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    return bdrv_co_flush(bs->backing->bs);
>>>>
>>>> Should we flush the target, too?
>>>
>>> Hm, you've asked it already, on previous version :)
>>
>> I wasn’t sure...
>>
>>> Backup don't do it,
>>> so I just keep old behavior. And what is the reason to flush backup target
>>> on any guest flush?
>>
>> Hm, well, what’s the reason not to do it?
> 
> guest flushes will be slowed down?

Hm, the user could specify cache=unsafe if they don’t care.  Which gives
me second thoughs... [1]

>> Also, there are not only guest flushes.  bdrv_flush_all() exists, which
>> is called when the guest is stopped.  So who is going to flush the
>> target if not its parent?
>>
>> [...]
> 
> Backup job? But filter should not relay on it..

Hm.  Backup job actually doesn’t sound that wrong.

> But should really filter do that job, or it is here only to do CBW? Maybe target
> must have another parent which will care about flushing.
> 
> Ok, I think I'll flush target here too for safety, and leave a comment, that something
> smarter would be better.
> (or, if we decide to flush all children by default in generic code, I'll drop this handler)

[1] Thinking more about this, for normal backups the target file does
not reflect a valid disk state until the backup is done; just like for
qemu-img convert.  Just like qemu-img convert, there is therefore no
reason to flush the target until the job is done.

But there is also the other case of image fleecing.  In this case, the
target will have another parent, so bdrv_flush_all() will be done by
someone.  Still, it intuitively makes sense to me that in this case, the
backup-top filter should flush the target if the guest decides to flush
the device (so that the target stays consistent on disk).

backup-top currently cannot differentiate between the cases, but maybe
that is generally a useful hint to give to it?  (That is, whether the
target has a consistent state or not.)

[...]

>>>>> +        }
>>>>> +    } else {
>>>>> +        /* Source child */
>>>>> +        if (perm & BLK_PERM_WRITE) {
>>>>
>>>> Or WRITE_UNCHANGED, no?
>>>
>>> Why? We don't need doing CBW for unchanged write.
>>
>> But we will do it still, won’t we?
>>
>> (If an unchanging write comes in, this driver will handle it just like a
>> normal write, will it not?)
> 
> No, it will not, I check this flag in backup_top_co_pwritev

Oops.  My bad.

Max
Vladimir Sementsov-Ogievskiy June 17, 2019, 10:36 a.m. UTC | #6
14.06.2019 23:03, Max Reitz wrote:
> On 14.06.19 18:22, Vladimir Sementsov-Ogievskiy wrote:
>> 14.06.2019 15:57, Max Reitz wrote:
>>> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
>>>> 13.06.2019 18:57, Max Reitz wrote:
>>>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> Backup-top filter does copy-before-write operation. It should be
>>>>>> inserted above active disk and has a target node for CBW, like the
>>>>>> following:
>>>>>>
>>>>>>        +-------+
>>>>>>        | Guest |
>>>>>>        +-------+
>>>>>>            |r,w
>>>>>>            v
>>>>>>        +------------+  target   +---------------+
>>>>>>        | backup_top |---------->| target(qcow2) |
>>>>>>        +------------+   CBW     +---------------+
>>>>>>            |
>>>>>> backing |r,w
>>>>>>            v
>>>>>>        +-------------+
>>>>>>        | Active disk |
>>>>>>        +-------------+
>>>>>>
>>>>>> The driver will be used in backup instead of write-notifiers.
>>>>>>
>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>>> ---
>>>>>>     block/backup-top.h  |  64 +++++++++
>>>>>>     block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>     block/Makefile.objs |   2 +
>>>>>>     3 files changed, 388 insertions(+)
>>>>>>     create mode 100644 block/backup-top.h
>>>>>>     create mode 100644 block/backup-top.c
>>>>>>
>>>>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>>>>> new file mode 100644
>>>>>> index 0000000000..788e18c358
>>>>>> --- /dev/null
>>>>>> +++ b/block/backup-top.h
>>>
>>> [...]
>>>
>>>>>> +/*
>>>>>> + * bdrv_backup_top_append
>>>>>> + *
>>>>>> + * Append backup_top filter node above @source node. @target node will receive
>>>>>> + * the data backed up during CBE operations. New filter together with @target
>>>>>> + * node are attached to @source aio context.
>>>>>> + *
>>>>>> + * The resulting filter node is implicit.
>>>>>
>>>>> Why?  It’s just as easy for the caller to just make it implicit if it
>>>>> should be.  (And usually the caller should decide that.)
>>>>
>>>> Personally, I don't know what are the reasons for filters to bi implicit or not,
>>>> I just made it like other job-filters.. I can move making-implicit to the caller
>>>> or drop it at all (if it will work).
>>>
>>> Nodes are implicit if they haven’t been added consciously by the user.
>>> A node added by a block job can be non-implicit, too, as mirror shows;
>>> If the user specifies the filter-node-name option, they will know about
>>> the node, thus it is no longer implicit.
>>>
>>> If the user doesn’t know about the node (they didn’t give the
>>> filter-node-name option), the node is implicit.
>>>
>>
>> Ok, I understand it. But it doesn't show, why it should be implicit?
>> Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
>> from query-named-block-nodes (the only interface to explore the whole graph for now)
>> anyway. And we can't absolutely hide side effects of additional node in the graph.
> 
> Well, we try, at least.  At least we hide them from query-block.
> 
>> So, is there any real benefit of supporting separation into implicit and explicit filters?
>> It seems for me that it only complicates things...
>> In other words, what will break if we make all filters explicit?
> 
> The theory is that qemu may decide to add nodes at any point, but at
> least when managing chains etc., they may not be visible to the user.  I
> don’t think we can get rid of them so easily.
> 
> One example that isn’t implemented yet is copy-on-read.  In theory,
> specifying copy-on-read=on for -drive should create an implicit COR node
> on top.  The user shouldn’t see that node when inspecting the drive or
> when issuing block jobs on it, etc.  And this node has to stay there
> when the user does e.g. an active commit somewhere down the chain.

Why not instead just write in the docs that yes, a filter is created when you
enable copy-on-read? And do the same for all operations which may create a filter —
we don't have a lot of them. And add an optional parameter to set the node-name for
the created filter.

> 
> That sounds like a horrible ordeal to implement, so it hasn’t been done
> yet.  Maybe it never will.  It isn’t that bad for the job filters,
> because they generally freeze the block graph, so there is no problem
> with potential modifications.
> 
> All in all I do think having implicit nodes makes sense.  Maybe not so
> much now, but in the future (if someone implements converting -drive COR
> and throttle options to implicit nodes...).

But do we at least have a strong definition of how implicit nodes should behave
under any graph-modifying operations around them? Should new implicit/explicit
filters be created above or under them? We should specify this in the docs for all
such operations, otherwise the effect of implicit nodes may change unpredictably for
the user. I'm afraid that implicit filters will just extend my list of
features which should not be used if we want to be able to maintain all this mess..

1. If something around filenames doesn't work, use node-names, and better yet,
don't use filename-based APIs

2. If qemu-img can't do what you need, use a paused qemu process, and better yet,
don't use qemu-img

and a new one:

3. If something is broken around jobs, filters or any block operations,
set filter-node-name in the parameters to make the filter node explicit. And better yet,
always set it..

So at least, I think we should make it possible for all filters to be explicit
if the user wants, by setting the filter's node name like in mirror.

(and then, I will not really care about implicit-filter related logic, just like I don't
really care about filename-API related logic)

Also, right now, implicit filters are actually available for direct manipulation by the user,
as their node-names are exported in query-named-block-nodes and nothing prevents
using these names in different APIs.

> 
>>> [...]
>>>
>>>>>> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
>>>>>> +{
>>>>>> +    if (!bs->backing) {
>>>>>> +        return 0;
>>>>>> +    }
>>>>>> +
>>>>>> +    return bdrv_co_flush(bs->backing->bs);
>>>>>
>>>>> Should we flush the target, too?
>>>>
>>>> Hm, you've asked it already, on previous version :)
>>>
>>> I wasn’t sure...
>>>
>>>> Backup don't do it,
>>>> so I just keep old behavior. And what is the reason to flush backup target
>>>> on any guest flush?
>>>
>>> Hm, well, what’s the reason not to do it?
>>
>> guest flushes will be slowed down?
> 
> Hm, the user could specify cache=unsafe if they don’t care.  Which gives
> me second thoughs... [1]
> 
>>> Also, there are not only guest flushes.  bdrv_flush_all() exists, which
>>> is called when the guest is stopped.  So who is going to flush the
>>> target if not its parent?
>>>
>>> [...]
>>
>> Backup job? But filter should not relay on it..
> 
> Hm.  Backup job actually doesn’t sound that wrong.
> 
>> But should really filter do that job, or it is here only to do CBW? Maybe target
>> must have another parent which will care about flushing.
>>
>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
>> smarter would be better.
>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
> 
> [1] Thinking more about this, for normal backups the target file does
> not reflect a valid disk state until the backup is done; just like for
> qemu-img convert.  Just like qemu-img convert, there is therefore no
> reason to flush the target until the job is done.
> 
> But there is also the other case of image fleecing.  In this case, the
> target will have another parent, so bdrv_flush_all() will be done by
> someone.  Still, it intuitively makes sense to me that in this case, the
> backup-top filter should flush the target if the guest decides to flush
> the device (so that the target stays consistent on disk).
> 
> backup-top currently cannot differentiate between the cases, but maybe
> that is generally a useful hint to give to it?  (That is, whether the
> target has a consistent state or not.)

Hmm, for fleecing we don't need a consistent state of the temporary image. If something fails,
the whole operation is considered failed. And anyway, I don't see relations
between consistency of temporary fleecing image and guest flushes, why should we
bind them?

> 
> [...]
> 
>>>>>> +        }
>>>>>> +    } else {
>>>>>> +        /* Source child */
>>>>>> +        if (perm & BLK_PERM_WRITE) {
>>>>>
>>>>> Or WRITE_UNCHANGED, no?
>>>>
>>>> Why? We don't need doing CBW for unchanged write.
>>>
>>> But we will do it still, won’t we?
>>>
>>> (If an unchanging write comes in, this driver will handle it just like a
>>> normal write, will it not?)
>>
>> No, it will not, I check this flag in backup_top_co_pwritev
> 
> Oops.  My bad.
> 
> Max
>
Max Reitz June 17, 2019, 2:56 p.m. UTC | #7
On 17.06.19 12:36, Vladimir Sementsov-Ogievskiy wrote:
> 14.06.2019 23:03, Max Reitz wrote:
>> On 14.06.19 18:22, Vladimir Sementsov-Ogievskiy wrote:
>>> 14.06.2019 15:57, Max Reitz wrote:
>>>> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 13.06.2019 18:57, Max Reitz wrote:
>>>>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> Backup-top filter does copy-before-write operation. It should be
>>>>>>> inserted above active disk and has a target node for CBW, like the
>>>>>>> following:
>>>>>>>
>>>>>>>        +-------+
>>>>>>>        | Guest |
>>>>>>>        +-------+
>>>>>>>            |r,w
>>>>>>>            v
>>>>>>>        +------------+  target   +---------------+
>>>>>>>        | backup_top |---------->| target(qcow2) |
>>>>>>>        +------------+   CBW     +---------------+
>>>>>>>            |
>>>>>>> backing |r,w
>>>>>>>            v
>>>>>>>        +-------------+
>>>>>>>        | Active disk |
>>>>>>>        +-------------+
>>>>>>>
>>>>>>> The driver will be used in backup instead of write-notifiers.
>>>>>>>
>>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>>>> ---
>>>>>>>     block/backup-top.h  |  64 +++++++++
>>>>>>>     block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>     block/Makefile.objs |   2 +
>>>>>>>     3 files changed, 388 insertions(+)
>>>>>>>     create mode 100644 block/backup-top.h
>>>>>>>     create mode 100644 block/backup-top.c
>>>>>>>
>>>>>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>>>>>> new file mode 100644
>>>>>>> index 0000000000..788e18c358
>>>>>>> --- /dev/null
>>>>>>> +++ b/block/backup-top.h
>>>>
>>>> [...]
>>>>
>>>>>>> +/*
>>>>>>> + * bdrv_backup_top_append
>>>>>>> + *
>>>>>>> + * Append backup_top filter node above @source node. @target node will receive
>>>>>>> + * the data backed up during CBE operations. New filter together with @target
>>>>>>> + * node are attached to @source aio context.
>>>>>>> + *
>>>>>>> + * The resulting filter node is implicit.
>>>>>>
>>>>>> Why?  It’s just as easy for the caller to just make it implicit if it
>>>>>> should be.  (And usually the caller should decide that.)
>>>>>
>>>>> Personally, I don't know what are the reasons for filters to bi implicit or not,
>>>>> I just made it like other job-filters.. I can move making-implicit to the caller
>>>>> or drop it at all (if it will work).
>>>>
>>>> Nodes are implicit if they haven’t been added consciously by the user.
>>>> A node added by a block job can be non-implicit, too, as mirror shows;
>>>> If the user specifies the filter-node-name option, they will know about
>>>> the node, thus it is no longer implicit.
>>>>
>>>> If the user doesn’t know about the node (they didn’t give the
>>>> filter-node-name option), the node is implicit.
>>>>
>>>
>>> Ok, I understand it. But it doesn't show, why it should be implicit?
>>> Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
>>> from query-named-block-nodes (the only interface to explore the whole graph for now)
>>> anyway. And we can't absolutely hide side effects of additional node in the graph.
>>
>> Well, we try, at least.  At least we hide them from query-block.
>>
>>> So, is there any real benefit of supporting separation into implicit and explicit filters?
>>> It seems for me that it only complicates things...
>>> In other words, what will break if we make all filters explicit?
>>
>> The theory is that qemu may decide to add nodes at any point, but at
>> least when managing chains etc., they may not be visible to the user.  I
>> don’t think we can get rid of them so easily.
>>
>> One example that isn’t implemented yet is copy-on-read.  In theory,
>> specifying copy-on-read=on for -drive should create an implicit COR node
>> on top.  The user shouldn’t see that node when inspecting the drive or
>> when issuing block jobs on it, etc.  And this node has to stay there
>> when the user does e.g. an active commit somewhere down the chain.
> 
> Why instead not to just write in doc that yes, filter is created when you
> enable copy-on-read?

Because the problem is that existing users are not aware of that.

If they were, they could simply create a COR filter already.

I suppose we could interpret the deprecation policy in a way that we
could change the behavior of -drive copy-on-read=on, but as I already
said, what’s the point of supporting -drive copy-on-read=on, when you
can simply explicitly create a COR filter on top?

> And do same for all operations which may create filter,
> we don't have a lot of them? And add optional parameter to set node-name for
> created filter.

That optional parameter exists, and if it is given, the user shows that
they are aware of the node.  Hence the node becomes explicit then.

The problem is with legacy users that use an existing interface and
suddenly the externally visible interface of qemu changes in a way that
could break them.  I suppose additional entries in
query-named-block-nodes is not breakage.  Maybe it is, that would be a
bug then.

(If nobody has noticed so far, that may be because these legacy
applications didn’t use query-named-block-nodes.  But now maybe libvirt
does, and so if we converted copy-on-read=on to a COR node, it would
notice now and thus break.  So just because our handling of implicit
nodes is broken right now does not mean we can abandon the concept
altogether.)

>> That sounds like a horrible ordeal to implement, so it hasn’t been done
>> yet.  Maybe it never will.  It isn’t that bad for the job filters,
>> because they generally freeze the block graph, so there is no problem
>> with potential modifications.
>>
>> All in all I do think having implicit nodes makes sense.  Maybe not so
>> much now, but in the future (if someone implements converting -drive COR
>> and throttle options to implicit nodes...).
> 
> But do we have at least strong definition of how implicit nodes should behave
> on any graph-modifying operations around them?

No.  Depends on the node.

> Should new implicit/explicit
> filters be created above or under them?

That was always the most difficult question we had when we introduced
filters.

The problem is that we never answered it in our code base.

One day, we just added filters (“let’s see what happens”), and in the
beginning, they were all explicit, so we still didn’t have to answer
this question (in code).  Then job filters were added, and we still
didn’t have to, because they set blockers so the graph couldn’t be
reorganized with them in place anyway.

If we converted copy-on-read=on to a COR node, we would have to answer
that question.

> We should specify it in doc about all
> such operations, otherwise effect of implicit nodes may change unpredictably for
> the user. I'm afraid, that implicit filters will just continue my list of
> features which should not be used if we want to be able to maintain all this mess..

Yeah, well, that is nothing new at all.  For example, note the “Split
I/O throttling off into own BDS – Requires some care with snapshots
etc.” here:
https://wiki.qemu.org/ToDo/Block#Dynamic_graph_reconfiguration_.28e.g._adding_block_filters.2C_taking_snapshots.2C_etc..29

This was always a problem.  It was always the biggest problem with
filters.  We never answered it because it never become an acute problem,
as I described above.

We just said “For explicit filters, they just stay where they are, and
that’s OK because the user can take care of them.”  We never did add the
implicit filters that were actually going to become a problem (COR and
throttle).

So as I said above, the behavior of implicit nodes needs to be examined
on a case-by-case basis.  Well, actually, this is only about COR and
throttle, really, and both would behave the same: Always stay at the
BlockBackend, because that is the behavior when you currently use
BB-level throttling or COR.

The jobs that now use implicit nodes freeze the graph, so there is no
problem there.

(Also, by the way, we do not need the option to give COR/throttle nodes
created through BB options a node name.  As I said above, at that point
you can just create the node yourself.)

> 1. If you something around filenames don't work, use node-names, and better
> don't use filename-based APIs

I think we agree that filename-based APIs were a mistake.  (Though only
in hindsight.  They were added when there where no node names, so it did
make sense then.)

So now you should not use them, correct.

> 2. If you qemu-img can't do what you need, use paused qemu process, and better,
> don't use qemu-img

There was always a plan to give qemu-img feature parity to qemu proper.
 But often the interface is just a problem.  Node names really don’t
make much sense with qemu-img, I think.

(I mean, feel free to add something that automatically names the backing
nodes e.g. backing.0, backing.1, etc., and then add an interface to
qemu-img commit to use node names, but I find that weird.)

So I don’t quite see the problem.  Of course qemu has more functionality
than qemu-img.  qemu-img is fine as long as you have a simple set-up,
but once it gets more complicated, you need management software; and
that software shouldn’t care much whether it uses qemu-img or qemu itself.

> and a new one:
> 
> 3. If something is broken around jobs and filters and any block operations,
> set filter-node-name in parameters to make filter node explicit. And better,
> always set it..

Uh, yeah?  I don’t see the problem here.

This is like saying “If I want to use new features, I should use
-blockdev and not -hda”.  Yes, that is correct.  You should set
filter-node-name if you care about the exact state of the graph.

> So at least, I think we should make it possible for all filters to be explicit
> if user wants, by setting its name like in mirror.

For all filters that the user has no other way of creating them, yes.
(That is, the filters created by the block jobs.)

> (and then, I will not really care of implicit-filters related logic, like I don't
> really care about filename-api related logic)
> 
> Also, now, implict filters are actually available for direct manipulation by user,
> as their node-names are exported in query-named-block-nodes and nothing prevents
> using these names in differet APIs.

Hm, yes.  Is that a problem?

>>>> [...]
>>>>
>>>>>>> +static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
>>>>>>> +{
>>>>>>> +    if (!bs->backing) {
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return bdrv_co_flush(bs->backing->bs);
>>>>>>
>>>>>> Should we flush the target, too?
>>>>>
>>>>> Hm, you've asked it already, on previous version :)
>>>>
>>>> I wasn’t sure...
>>>>
>>>>> Backup don't do it,
>>>>> so I just keep old behavior. And what is the reason to flush backup target
>>>>> on any guest flush?
>>>>
>>>> Hm, well, what’s the reason not to do it?
>>>
>>> guest flushes will be slowed down?
>>
>> Hm, the user could specify cache=unsafe if they don’t care.  Which gives
>> me second thoughs... [1]
>>
>>>> Also, there are not only guest flushes.  bdrv_flush_all() exists, which
>>>> is called when the guest is stopped.  So who is going to flush the
>>>> target if not its parent?
>>>>
>>>> [...]
>>>
>>> Backup job? But filter should not relay on it..
>>
>> Hm.  Backup job actually doesn’t sound that wrong.
>>
>>> But should really filter do that job, or it is here only to do CBW? Maybe target
>>> must have another parent which will care about flushing.
>>>
>>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
>>> smarter would be better.
>>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
>>
>> [1] Thinking more about this, for normal backups the target file does
>> not reflect a valid disk state until the backup is done; just like for
>> qemu-img convert.  Just like qemu-img convert, there is therefore no
>> reason to flush the target until the job is done.
>>
>> But there is also the other case of image fleecing.  In this case, the
>> target will have another parent, so bdrv_flush_all() will be done by
>> someone.  Still, it intuitively makes sense to me that in this case, the
>> backup-top filter should flush the target if the guest decides to flush
>> the device (so that the target stays consistent on disk).
>>
>> backup-top currently cannot differentiate between the cases, but maybe
>> that is generally a useful hint to give to it?  (That is, whether the
>> target has a consistent state or not.)
> 
> Hmm, for fleecing we don't need consistent state of temporary image. If something fails,
> the whole operation is considered to be failed. And anyway, I don't see relations
> between consistency of temporary fleecing image and guest flushes, why should we
> bind them?

Guest flushes are about clearing the cache and keeping the disk state
consistent in case of e.g. power failure.  So if the target should be
consistent, it too should then survive a power failure.

Hm, but then again, if we don’t enforce on the host that the target is
always flushed before the source, that consistency is quickly lost when
the host decides to flush the cache just because it wants to.

Hmmm.  I guess we could say that the target is always inconsistent on
e.g. power failure.

My problem is that the target is also, well, a backup, you know.  So
it’s kind of a pain if we don’t care about it.  The user probably wanted
to save that old data, and now it may be lost just because we didn’t
flush it when the user entered “sync” in the VM.  I don’t know.

Max
Kevin Wolf June 17, 2019, 3:53 p.m. UTC | #8
Am 17.06.2019 um 16:56 hat Max Reitz geschrieben:
> On 17.06.19 12:36, Vladimir Sementsov-Ogievskiy wrote:
> > 14.06.2019 23:03, Max Reitz wrote:
> >> On 14.06.19 18:22, Vladimir Sementsov-Ogievskiy wrote:
> >>> 14.06.2019 15:57, Max Reitz wrote:
> >>>> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
> >>>>> 13.06.2019 18:57, Max Reitz wrote:
> >>>>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
> >>>>>>> Backup-top filter does copy-before-write operation. It should be
> >>>>>>> inserted above active disk and has a target node for CBW, like the
> >>>>>>> following:
> >>>>>>>
> >>>>>>>        +-------+
> >>>>>>>        | Guest |
> >>>>>>>        +-------+
> >>>>>>>            |r,w
> >>>>>>>            v
> >>>>>>>        +------------+  target   +---------------+
> >>>>>>>        | backup_top |---------->| target(qcow2) |
> >>>>>>>        +------------+   CBW     +---------------+
> >>>>>>>            |
> >>>>>>> backing |r,w
> >>>>>>>            v
> >>>>>>>        +-------------+
> >>>>>>>        | Active disk |
> >>>>>>>        +-------------+
> >>>>>>>
> >>>>>>> The driver will be used in backup instead of write-notifiers.
> >>>>>>>
> >>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> >>>>>>> ---
> >>>>>>>     block/backup-top.h  |  64 +++++++++
> >>>>>>>     block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
> >>>>>>>     block/Makefile.objs |   2 +
> >>>>>>>     3 files changed, 388 insertions(+)
> >>>>>>>     create mode 100644 block/backup-top.h
> >>>>>>>     create mode 100644 block/backup-top.c
> >>>>>>>
> >>>>>>> diff --git a/block/backup-top.h b/block/backup-top.h
> >>>>>>> new file mode 100644
> >>>>>>> index 0000000000..788e18c358
> >>>>>>> --- /dev/null
> >>>>>>> +++ b/block/backup-top.h
> >>>>
> >>>> [...]
> >>>>
> >>>>>>> +/*
> >>>>>>> + * bdrv_backup_top_append
> >>>>>>> + *
> >>>>>>> + * Append backup_top filter node above @source node. @target node will receive
> >>>>>>> + * the data backed up during CBE operations. New filter together with @target
> >>>>>>> + * node are attached to @source aio context.
> >>>>>>> + *
> >>>>>>> + * The resulting filter node is implicit.
> >>>>>>
> >>>>>> Why?  It’s just as easy for the caller to just make it implicit if it
> >>>>>> should be.  (And usually the caller should decide that.)
> >>>>>
> >>>>> Personally, I don't know what are the reasons for filters to bi implicit or not,
> >>>>> I just made it like other job-filters.. I can move making-implicit to the caller
> >>>>> or drop it at all (if it will work).
> >>>>
> >>>> Nodes are implicit if they haven’t been added consciously by the user.
> >>>> A node added by a block job can be non-implicit, too, as mirror shows;
> >>>> If the user specifies the filter-node-name option, they will know about
> >>>> the node, thus it is no longer implicit.
> >>>>
> >>>> If the user doesn’t know about the node (they didn’t give the
> >>>> filter-node-name option), the node is implicit.
> >>>>
> >>>
> >>> Ok, I understand it. But it doesn't show, why it should be implicit?
> >>> Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
> >>> from query-named-block-nodes (the only interface to explore the whole graph for now)
> >>> anyway. And we can't absolutely hide side effects of additional node in the graph.
> >>
> >> Well, we try, at least.  At least we hide them from query-block.
> >>
> >>> So, is there any real benefit of supporting separation into implicit and explicit filters?
> >>> It seems for me that it only complicates things...
> >>> In other words, what will break if we make all filters explicit?
> >>
> >> The theory is that qemu may decide to add nodes at any point, but at
> >> least when managing chains etc., they may not be visible to the user.  I
> >> don’t think we can get rid of them so easily.

I think the important point to understand here is that implicit filters
are a concept to maintain compatibility with legacy (pre-blockdev)
management tools which simply don't manage stuff on the node level. So
changing the structure of the graph is not a problem for them and we
just need to make sure that when they do something with a BlockBackend,
we perform the action on the node that they actually mean.

Modern management tools are expected to manage the graph on a node level
and to assign node names to everything.

Note that libvirt is close to becoming a modern management tool, so it's
probably a good idea to now make all block jobs add filters where we'll
need them in the long run.

> >> One example that isn’t implemented yet is copy-on-read.  In theory,
> >> specifying copy-on-read=on for -drive should create an implicit COR node
> >> on top.  The user shouldn’t see that node when inspecting the drive or
> >> when issuing block jobs on it, etc.  And this node has to stay there
> >> when the user does e.g. an active commit somewhere down the chain.
> > 
> > Why instead not to just write in doc that yes, filter is created when you
> > enable copy-on-read?
> 
> Because the problem is that existing users are not aware of that.
> 
> If they were, they could simply create a COR filter already.
> 
> I suppose we could interpret the deprecation policy in a way that we
> could change the behavior of -drive copy-on-read=on, but as I already
> said, what’s the point of supporting -drive copy-on-read=on, when you
> can simply explicitly create a COR filter on top?

I actually changed my opinion on how to do this since we introduced
implicit filters. I now think that the right way to move forward is to
make sure that the copy-on-read filter can do everything it needs to do,
and then just completely deprecate -drive copy-on-read=on instead of
adding compatibility magic to turn it into an implicit copy-on-read
filter internally.

> >> That sounds like a horrible ordeal to implement, so it hasn’t been done
> >> yet.  Maybe it never will.  It isn’t that bad for the job filters,
> >> because they generally freeze the block graph, so there is no problem
> >> with potential modifications.
> >>
> >> All in all I do think having implicit nodes makes sense.  Maybe not so
> >> much now, but in the future (if someone implements converting -drive COR
> >> and throttle options to implicit nodes...).
> > 
> > But do we have at least strong definition of how implicit nodes should behave
> > on any graph-modifying operations around them?
> 
> No.  Depends on the node.
> 
> > Should new implicit/explicit
> > filters be created above or under them?
> 
> That was always the most difficult question we had when we introduced
> filters.
> 
> The problem is that we never answered it in our code base.
> 
> One day, we just added filters (“let’s see what happens”), and in the
> beginning, they were all explicit, so we still didn’t have to answer
> this question (in code).  Then job filters were added, and we still
> didn’t have to, because they set blockers so the graph couldn’t be
> reorganized with them in place anyway.
> 
> If we converted copy-on-read=on to a COR node, we would have to answer
> that question.

That's why we shouldn't do that, but just remove copy-on-read=on. :-)

> >>> But should really filter do that job, or it is here only to do CBW? Maybe target
> >>> must have another parent which will care about flushing.
> >>>
> >>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
> >>> smarter would be better.
> >>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
> >>
> >> [1] Thinking more about this, for normal backups the target file does
> >> not reflect a valid disk state until the backup is done; just like for
> >> qemu-img convert.  Just like qemu-img convert, there is therefore no
> >> reason to flush the target until the job is done.

Depends, the target could have the source as its backing file (like
fleecing, but without exporting it right now), and then you could always
have a consistent view on the target. Well, almost.

Almost because to guarantee this, we'd have to flush between each CBW
and the corresponding write to the same block, to make sure that the old
data is backed up before it is overwritten.

Of course, this would perform about as badly as internal COW in qcow2...
So probably not a guarantee we should be making by default. But it might
make sense as an option.

Kevin
Max Reitz June 17, 2019, 4:01 p.m. UTC | #9
On 17.06.19 17:53, Kevin Wolf wrote:
> Am 17.06.2019 um 16:56 hat Max Reitz geschrieben:
>> On 17.06.19 12:36, Vladimir Sementsov-Ogievskiy wrote:
>>> 14.06.2019 23:03, Max Reitz wrote:
>>>> On 14.06.19 18:22, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 14.06.2019 15:57, Max Reitz wrote:
>>>>>> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>> 13.06.2019 18:57, Max Reitz wrote:
>>>>>>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>>>> Backup-top filter does copy-before-write operation. It should be
>>>>>>>>> inserted above active disk and has a target node for CBW, like the
>>>>>>>>> following:
>>>>>>>>>
>>>>>>>>>        +-------+
>>>>>>>>>        | Guest |
>>>>>>>>>        +-------+
>>>>>>>>>            |r,w
>>>>>>>>>            v
>>>>>>>>>        +------------+  target   +---------------+
>>>>>>>>>        | backup_top |---------->| target(qcow2) |
>>>>>>>>>        +------------+   CBW     +---------------+
>>>>>>>>>            |
>>>>>>>>> backing |r,w
>>>>>>>>>            v
>>>>>>>>>        +-------------+
>>>>>>>>>        | Active disk |
>>>>>>>>>        +-------------+
>>>>>>>>>
>>>>>>>>> The driver will be used in backup instead of write-notifiers.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>>>>>> ---
>>>>>>>>>     block/backup-top.h  |  64 +++++++++
>>>>>>>>>     block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>>     block/Makefile.objs |   2 +
>>>>>>>>>     3 files changed, 388 insertions(+)
>>>>>>>>>     create mode 100644 block/backup-top.h
>>>>>>>>>     create mode 100644 block/backup-top.c
>>>>>>>>>
>>>>>>>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>>>>>>>> new file mode 100644
>>>>>>>>> index 0000000000..788e18c358
>>>>>>>>> --- /dev/null
>>>>>>>>> +++ b/block/backup-top.h
>>>>>>
>>>>>> [...]
>>>>>>
>>>>>>>>> +/*
>>>>>>>>> + * bdrv_backup_top_append
>>>>>>>>> + *
>>>>>>>>> + * Append backup_top filter node above @source node. @target node will receive
>>>>>>>>> + * the data backed up during CBE operations. New filter together with @target
>>>>>>>>> + * node are attached to @source aio context.
>>>>>>>>> + *
>>>>>>>>> + * The resulting filter node is implicit.
>>>>>>>>
>>>>>>>> Why?  It’s just as easy for the caller to just make it implicit if it
>>>>>>>> should be.  (And usually the caller should decide that.)
>>>>>>>
>>>>>>> Personally, I don't know what are the reasons for filters to bi implicit or not,
>>>>>>> I just made it like other job-filters.. I can move making-implicit to the caller
>>>>>>> or drop it at all (if it will work).
>>>>>>
>>>>>> Nodes are implicit if they haven’t been added consciously by the user.
>>>>>> A node added by a block job can be non-implicit, too, as mirror shows;
>>>>>> If the user specifies the filter-node-name option, they will know about
>>>>>> the node, thus it is no longer implicit.
>>>>>>
>>>>>> If the user doesn’t know about the node (they didn’t give the
>>>>>> filter-node-name option), the node is implicit.
>>>>>>
>>>>>
>>>>> Ok, I understand it. But it doesn't show, why it should be implicit?
>>>>> Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
>>>>> from query-named-block-nodes (the only interface to explore the whole graph for now)
>>>>> anyway. And we can't absolutely hide side effects of additional node in the graph.
>>>>
>>>> Well, we try, at least.  At least we hide them from query-block.
>>>>
>>>>> So, is there any real benefit of supporting separation into implicit and explicit filters?
>>>>> It seems for me that it only complicates things...
>>>>> In other words, what will break if we make all filters explicit?
>>>>
>>>> The theory is that qemu may decide to add nodes at any point, but at
>>>> least when managing chains etc., they may not be visible to the user.  I
>>>> don’t think we can get rid of them so easily.
> 
> I think the important point to understand here is that implicit filters
> are a concept to maintain compatibility with legacy (pre-blockdev)
> management tools which simply don't manage stuff on the node level. So
> changing the structure of the graph is not a problem for them and we
> just need to make sure that when they do something with a BlockBackend,
> we perform the action on the node that they actually mean.
> 
> Modern management tools are expected to manage the graph on a node level
> and to assign node names to everything.
> 
> Note that libvirt is close to becoming a modern management tool, so it's
> probably a good idea to now make all block jobs add filters where we'll
> need them in the long run.
> 
>>>> One example that isn’t implemented yet is copy-on-read.  In theory,
>>>> specifying copy-on-read=on for -drive should create an implicit COR node
>>>> on top.  The user shouldn’t see that node when inspecting the drive or
>>>> when issuing block jobs on it, etc.  And this node has to stay there
>>>> when the user does e.g. an active commit somewhere down the chain.
>>>
>>> Why instead not to just write in doc that yes, filter is created when you
>>> enable copy-on-read?
>>
>> Because the problem is that existing users are not aware of that.
>>
>> If they were, they could simply create a COR filter already.
>>
>> I suppose we could interpret the deprecation policy in a way that we
>> could change the behavior of -drive copy-on-read=on, but as I already
>> said, what’s the point of supporting -drive copy-on-read=on, when you
>> can simply explicitly create a COR filter on top?
> 
> I actually changed my opinion on how to do this since we introduced
> implicit filters. I know think that the right way to move forward is to
> make sure that the copy-on-read filter can do everything it needs to do,
> and then just completely deprecate -drive copy-on-read=on instead of
> adding compatibility magic to turn it into an implicit copy-on-read
> filter internally.

Sure, so your answer to “what’s the point of supporting -drive
copy-on-read=on” is “there is none”.  Fair enough. :-)

>>>> That sounds like a horrible ordeal to implement, so it hasn’t been done
>>>> yet.  Maybe it never will.  It isn’t that bad for the job filters,
>>>> because they generally freeze the block graph, so there is no problem
>>>> with potential modifications.
>>>>
>>>> All in all I do think having implicit nodes makes sense.  Maybe not so
>>>> much now, but in the future (if someone implements converting -drive COR
>>>> and throttle options to implicit nodes...).
>>>
>>> But do we have at least strong definition of how implicit nodes should behave
>>> on any graph-modifying operations around them?
>>
>> No.  Depends on the node.
>>
>>> Should new implicit/explicit
>>> filters be created above or under them?
>>
>> That was always the most difficult question we had when we introduced
>> filters.
>>
>> The problem is that we never answered it in our code base.
>>
>> One day, we just added filters (“let’s see what happens”), and in the
>> beginning, they were all explicit, so we still didn’t have to answer
>> this question (in code).  Then job filters were added, and we still
>> didn’t have to, because they set blockers so the graph couldn’t be
>> reorganized with them in place anyway.
>>
>> If we converted copy-on-read=on to a COR node, we would have to answer
>> that question.
> 
> That's why we shouldn't do that, but just remove copy-on-read=on. :-)

And BB-level throttling, fair enough.

(Although the first step would be probably to make throttle groups
non-experimental?  Like, drop the x- prefix for all their parameters.)

>>>>> But should really filter do that job, or it is here only to do CBW? Maybe target
>>>>> must have another parent which will care about flushing.
>>>>>
>>>>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
>>>>> smarter would be better.
>>>>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
>>>>
>>>> [1] Thinking more about this, for normal backups the target file does
>>>> not reflect a valid disk state until the backup is done; just like for
>>>> qemu-img convert.  Just like qemu-img convert, there is therefore no
>>>> reason to flush the target until the job is done.
> 
> Depends, the target could have the source as its backing file (like
> fleecing, but without exporting it right now), and then you could always
> have a consistent view on the target. Well, almost.
> 
> Almost because to guarantee this, we'd have to flush between each CBW
> and the corresponding write to the same block, to make sure that the old
> data is backed up before it is overwritten.

Yes, that’s what I meant by “enforce on the host that the target is
always flushed before the source”.  Well, I meant to say there is no
good way of doing that, and I kind of don’t consider this a good way.

> Of course, this would perform about as badly as internal COW in qcow2...
> So probably not a guarantee we should be making by default. But it might
> make sense as an option.

I don’t know.  “Use this option so your data isn’t lost in case of a
power failure, but it makes everything pretty slow”?  Who is going to do
explicitly enable that (before their first data loss)?

Max
Kevin Wolf June 17, 2019, 4:25 p.m. UTC | #10
Am 17.06.2019 um 18:01 hat Max Reitz geschrieben:
> >>> Should new implicit/explicit
> >>> filters be created above or under them?
> >>
> >> That was always the most difficult question we had when we introduced
> >> filters.
> >>
> >> The problem is that we never answered it in our code base.
> >>
> >> One day, we just added filters (“let’s see what happens”), and in the
> >> beginning, they were all explicit, so we still didn’t have to answer
> >> this question (in code).  Then job filters were added, and we still
> >> didn’t have to, because they set blockers so the graph couldn’t be
> >> reorganized with them in place anyway.
> >>
> >> If we converted copy-on-read=on to a COR node, we would have to answer
> >> that question.
> > 
> > That's why we shouldn't do that, but just remove copy-on-read=on. :-)
> 
> And BB-level throttling, fair enough.
> 
> (Although the first step would be probably to make throttle groups
> non-experimental?  Like, drop the x- prefix for all their parameters.)

The first step would be making the block drivers full replacements of
the old things, which unfortunately isn't true today.

After your "deal with filters" series, we should be a lot closer for the
core infrastructure at least.

Not sure about copy-on-read, but I know that throttle still doesn't have
feature parity with -drive throttling. At least I'm pretty sure that
something about changing the group or group options at runtime (and not
just dropping the x-) was missing.

> >>>>> But should really filter do that job, or it is here only to do CBW? Maybe target
> >>>>> must have another parent which will care about flushing.
> >>>>>
> >>>>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
> >>>>> smarter would be better.
> >>>>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
> >>>>
> >>>> [1] Thinking more about this, for normal backups the target file does
> >>>> not reflect a valid disk state until the backup is done; just like for
> >>>> qemu-img convert.  Just like qemu-img convert, there is therefore no
> >>>> reason to flush the target until the job is done.
> > 
> > Depends, the target could have the source as its backing file (like
> > fleecing, but without exporting it right now), and then you could always
> > have a consistent view on the target. Well, almost.
> > 
> > Almost because to guarantee this, we'd have to flush between each CBW
> > and the corresponding write to the same block, to make sure that the old
> > data is backed up before it is overwritten.
> 
> Yes, that’s what I meant by “enforce on the host that the target is
> always flushed before the source”.  Well, I meant to say there is no
> good way of doing that, and I kind of don’t consider this a good way.
> 
> > Of course, this would perform about as badly as internal COW in qcow2...
> > So probably not a guarantee we should be making by default. But it might
> > make sense as an option.
> 
> I don’t know.  “Use this option so your data isn’t lost in case of a
> power failure, but it makes everything pretty slow”?  Who is going to do
> explicitly enable that (before their first data loss)?

Maybe the backup job wasn't that clever after all. At least if you care
about keeping the point-in-time snapshot at the start of the backup job
instead of just retrying and getting a snapshot of a different point in
time that is just as good.

If you do care about the specific point in time, the safe way to do it
is to take a snapshot and copy that away, and then delete the snapshot
again.

The only problem is that merging external snapshots is slow and you end
up writing the new data twice. If only we had a COW image format... :-)

Kevin
Vladimir Sementsov-Ogievskiy June 18, 2019, 7:19 a.m. UTC | #11
17.06.2019 19:25, Kevin Wolf wrote:
> Am 17.06.2019 um 18:01 hat Max Reitz geschrieben:
>>>>> Should new implicit/explicit
>>>>> filters be created above or under them?
>>>>
>>>> That was always the most difficult question we had when we introduced
>>>> filters.
>>>>
>>>> The problem is that we never answered it in our code base.
>>>>
>>>> One day, we just added filters (“let’s see what happens”), and in the
>>>> beginning, they were all explicit, so we still didn’t have to answer
>>>> this question (in code).  Then job filters were added, and we still
>>>> didn’t have to, because they set blockers so the graph couldn’t be
>>>> reorganized with them in place anyway.
>>>>
>>>> If we converted copy-on-read=on to a COR node, we would have to answer
>>>> that question.
>>>
>>> That's why we shouldn't do that, but just remove copy-on-read=on. :-)
>>
>> And BB-level throttling, fair enough.
>>
>> (Although the first step would be probably to make throttle groups
>> non-experimental?  Like, drop the x- prefix for all their parameters.)
> 
> The first step would be making the block drivers full replacements of
> the old things, which unfortunately isn't true today.
> 
> After your "deal with filters" series, we should be a lot closer for the
> core infrastructure at least.
> 
> Not sure about copy-on-read, but I know that throttle still doesn't have
> feature parity with -drive throttling. At least I'm pretty sure that
> something about changing the group or group options at runtime (and not
> just dropping the x-) was missing.

OK, thanks, I understand now that implicit filters are not just a feature but
a compatibility mechanism.

So, can we at some point deprecate the "optionality" of the filter-node-name parameters of jobs,
in addition to deprecating things like the old copy-on-read option?
And thereby actually deprecate implicit filters?

> 
>>>>>>> But should really filter do that job, or it is here only to do CBW? Maybe target
>>>>>>> must have another parent which will care about flushing.
>>>>>>>
>>>>>>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
>>>>>>> smarter would be better.
>>>>>>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
>>>>>>
>>>>>> [1] Thinking more about this, for normal backups the target file does
>>>>>> not reflect a valid disk state until the backup is done; just like for
>>>>>> qemu-img convert.  Just like qemu-img convert, there is therefore no
>>>>>> reason to flush the target until the job is done.
>>>
>>> Depends, the target could have the source as its backing file (like
>>> fleecing, but without exporting it right now), and then you could always
>>> have a consistent view on the target. Well, almost.
>>>
>>> Almost because to guarantee this, we'd have to flush between each CBW
>>> and the corresponding write to the same block, to make sure that the old
>>> data is backed up before it is overwritten.
>>
>> Yes, that’s what I meant by “enforce on the host that the target is
>> always flushed before the source”.  Well, I meant to say there is no
>> good way of doing that, and I kind of don’t consider this a good way.
>>
>>> Of course, this would perform about as badly as internal COW in qcow2...
>>> So probably not a guarantee we should be making by default. But it might
>>> make sense as an option.
>>
>> I don’t know.  “Use this option so your data isn’t lost in case of a
>> power failure, but it makes everything pretty slow”?  Who is going to do
>> explicitly enable that (before their first data loss)?
> 
> Maybe the backup job wasn't that clever after all. At least if you care
> about keeping the point-in-time snapshot at the start of the backup job
> instead of just retrying and getting a snapshot of a different point in
> time that is just as good.
> 
> If you do care about the specific point in time, the safe way to do it
> is to take a snapshot and copy that away, and then delete the snapshot
> again.
> 
> The only problem is that merging external snapshots is slow and you end
> up writing the new data twice. If only we had a COW image format... :-)
> 

So, I still don't understand the reason for flushing the backup target in the meantime.
The backup target remains invalid until the successful end of the job, at which point
we definitely flush the target, but only once.

If the node crashes during backup, the backup would be invalid regardless of whether
there were flushes after each write (or better, just enabling O_DIRECT) or not.
Vladimir Sementsov-Ogievskiy June 18, 2019, 7:25 a.m. UTC | #12
17.06.2019 17:56, Max Reitz wrote:
> On 17.06.19 12:36, Vladimir Sementsov-Ogievskiy wrote:
>> 14.06.2019 23:03, Max Reitz wrote:
>>> On 14.06.19 18:22, Vladimir Sementsov-Ogievskiy wrote:
>>>> 14.06.2019 15:57, Max Reitz wrote:
>>>>> On 14.06.19 11:04, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> 13.06.2019 18:57, Max Reitz wrote:
>>>>>>> On 29.05.19 17:46, Vladimir Sementsov-Ogievskiy wrote:
>>>>>>>> Backup-top filter does copy-before-write operation. It should be
>>>>>>>> inserted above active disk and has a target node for CBW, like the
>>>>>>>> following:
>>>>>>>>
>>>>>>>>         +-------+
>>>>>>>>         | Guest |
>>>>>>>>         +-------+
>>>>>>>>             |r,w
>>>>>>>>             v
>>>>>>>>         +------------+  target   +---------------+
>>>>>>>>         | backup_top |---------->| target(qcow2) |
>>>>>>>>         +------------+   CBW     +---------------+
>>>>>>>>             |
>>>>>>>> backing |r,w
>>>>>>>>             v
>>>>>>>>         +-------------+
>>>>>>>>         | Active disk |
>>>>>>>>         +-------------+
>>>>>>>>
>>>>>>>> The driver will be used in backup instead of write-notifiers.
>>>>>>>>
>>>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>>>>> ---
>>>>>>>>      block/backup-top.h  |  64 +++++++++
>>>>>>>>      block/backup-top.c  | 322 ++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>>      block/Makefile.objs |   2 +
>>>>>>>>      3 files changed, 388 insertions(+)
>>>>>>>>      create mode 100644 block/backup-top.h
>>>>>>>>      create mode 100644 block/backup-top.c
>>>>>>>>
>>>>>>>> diff --git a/block/backup-top.h b/block/backup-top.h
>>>>>>>> new file mode 100644
>>>>>>>> index 0000000000..788e18c358
>>>>>>>> --- /dev/null
>>>>>>>> +++ b/block/backup-top.h
>>>>>
>>>>> [...]
>>>>>
>>>>>>>> +/*
>>>>>>>> + * bdrv_backup_top_append
>>>>>>>> + *
>>>>>>>> + * Append backup_top filter node above @source node. @target node will receive
>>>>>>>> + * the data backed up during CBE operations. New filter together with @target
>>>>>>>> + * node are attached to @source aio context.
>>>>>>>> + *
>>>>>>>> + * The resulting filter node is implicit.
>>>>>>>
>>>>>>> Why?  It’s just as easy for the caller to just make it implicit if it
>>>>>>> should be.  (And usually the caller should decide that.)
>>>>>>
>>>>>> Personally, I don't know what the reasons are for filters to be implicit or not,
>>>>>> I just made it like other job-filters.. I can move making-implicit to the caller
>>>>>> or drop it at all (if it will work).
>>>>>
>>>>> Nodes are implicit if they haven’t been added consciously by the user.
>>>>> A node added by a block job can be non-implicit, too, as mirror shows;
>>>>> If the user specifies the filter-node-name option, they will know about
>>>>> the node, thus it is no longer implicit.
>>>>>
>>>>> If the user doesn’t know about the node (they didn’t give the
>>>>> filter-node-name option), the node is implicit.
>>>>>
>>>>
>>>> Ok, I understand it. But it doesn't show, why it should be implicit?
>>>> Isn't it less buggy to make all filters explicit? We don't hide implicit nodes
>>>> from query-named-block-nodes (the only interface to explore the whole graph for now)
>>>> anyway. And we can't absolutely hide side effects of additional node in the graph.
>>>
>>> Well, we try, at least.  At least we hide them from query-block.
>>>
>>>> So, is there any real benefit of supporting separation into implicit and explicit filters?
>>>> It seems for me that it only complicates things...
>>>> In other words, what will break if we make all filters explicit?
>>>
>>> The theory is that qemu may decide to add nodes at any point, but at
>>> least when managing chains etc., they may not be visible to the user.  I
>>> don’t think we can get rid of them so easily.
>>>
>>> One example that isn’t implemented yet is copy-on-read.  In theory,
>>> specifying copy-on-read=on for -drive should create an implicit COR node
>>> on top.  The user shouldn’t see that node when inspecting the drive or
>>> when issuing block jobs on it, etc.  And this node has to stay there
>>> when the user does e.g. an active commit somewhere down the chain.
>>
>> Why instead not to just write in doc that yes, filter is created when you
>> enable copy-on-read?
> 
> Because the problem is that existing users are not aware of that.
> 
> If they were, they could simply create a COR filter already.
> 
> I suppose we could interpret the deprecation policy in a way that we
> could change the behavior of -drive copy-on-read=on, but as I already
> said, what’s the point of supporting -drive copy-on-read=on, when you
> can simply explicitly create a COR filter on top?
> 
>> And do same for all operations which may create filter,
>> we don't have a lot of them? And add optional parameter to set node-name for
>> created filter.
> 
> That optional parameter exists, and if it is given, the user shows that
> they are aware of the node.  Hence the node becomes explicit then.
> 
> The problem is with legacy users that use an existing interface and
> suddenly the externally visible interface of qemu changes in a way that
> could break them.  I suppose additional entries in
> query-named-block-nodes is not breakage.  Maybe it is, that would be a
> bug then.
> 
> (If nobody has noticed so far, that may be because these legacy
> applications didn’t use query-named-block-nodes.  But now maybe libvirt
> does, and so if we converted copy-on-read=on to a COR node, it would
> notice now and thus break.  So just because our handling of implicit
> nodes is broken right now does not mean we can abandon the concept
> altogether.)
> 
>>> That sounds like a horrible ordeal to implement, so it hasn’t been done
>>> yet.  Maybe it never will.  It isn’t that bad for the job filters,
>>> because they generally freeze the block graph, so there is no problem
>>> with potential modifications.
>>>
>>> All in all I do think having implicit nodes makes sense.  Maybe not so
>>> much now, but in the future (if someone implements converting -drive COR
>>> and throttle options to implicit nodes...).
>>
>> But do we have at least strong definition of how implicit nodes should behave
>> on any graph-modifying operations around them?
> 
> No.  Depends on the node.
> 
>> Should new implicit/explicit
>> filters be created above or under them?
> 
> That was always the most difficult question we had when we introduced
> filters.
> 
> The problem is that we never answered it in our code base.
> 
> One day, we just added filters (“let’s see what happens”), and in the
> beginning, they were all explicit, so we still didn’t have to answer
> this question (in code).  Then job filters were added, and we still
> didn’t have to, because they set blockers so the graph couldn’t be
> reorganized with them in place anyway.
> 
> If we converted copy-on-read=on to a COR node, we would have to answer
> that question.
> 
>> We should specify it in doc about all
>> such operations, otherwise effect of implicit nodes may change unpredictably for
>> the user. I'm afraid, that implicit filters will just continue my list of
>> features which should not be used if we want to be able to maintain all this mess..
> 
> Yeah, well, that is nothing new at all.  For example, note the “Split
> I/O throttling off into own BDS – Requires some care with snapshots
> etc.” here:
> https://wiki.qemu.org/ToDo/Block#Dynamic_graph_reconfiguration_.28e.g._adding_block_filters.2C_taking_snapshots.2C_etc..29
> 
> This was always a problem.  It was always the biggest problem with
> filters.  We never answered it because it never become an acute problem,
> as I described above.
> 
> We just said “For explicit filters, they just stay where they are, and
> that’s OK because the user can take care of them.”  We never did add the
> implicit filters that were actually going to become a problem (COR and
> throttle).
> 
> So as I said above, the behavior of implicit nodes needs to be examined
> on a case-by-case basis.  Well, actually, this is only about COR and
> throttle, really, and both would behave the same: Always stay at the
> BlockBackend, because that is the behavior when you currently use
> BB-level throttling or COR.
> 
> The jobs that now use implicit nodes freeze the graph, so there is no
> problem there.
> 
> (Also, by the way, we do not need the option to give COR/throttle nodes
> created through BB options a node name.  As I said above, at that point
> you can just create the node yourself.)
> 
>> 1. If you something around filenames don't work, use node-names, and better
>> don't use filename-based APIs
> 
> I think we agree that filename-based APIs were a mistake.  (Though only
> in hindsight.  They were added when there where no node names, so it did
> make sense then.)
> 
> So now you should not use them, correct.
> 
>> 2. If you qemu-img can't do what you need, use paused qemu process, and better,
>> don't use qemu-img
> 
> There was always a plan to give qemu-img feature parity to qemu proper.
>   But often the interface is just a problem.  Node names really don’t
> make much sense with qemu-img, I think.
> 
> (I mean, feel free to add something that automatically names the backing
> nodes e.g. backing.0, backing.1, etc., and then add an interface to
> qemu-img commit to use node names, but I find that weird.)
> 
> So I don’t quite see the problem.  Of course qemu has more functionality
> than qemu-img.  qemu-img is fine as long as you have a simple set-up,
> but once it gets more complicated, you need management software; and
> that software shouldn’t care much whether it uses qemu-img or qemu itself.
> 
>> and a new one:
>>
>> 3. If something is broken around jobs and filters and any block operations,
>> set filter-node-name in parameters to make filter node explicit. And better,
>> always set it..
> 
> Uh, yeah?  I don’t see the problem here.
> 
> This is like saying “If I want to use new features, I should use
> -blockdev and not -hda”.  Yes, that is correct.  You should set
> filter-node-name if you care about the exact state of the graph.

OK, yes, I understand now. Sorry for listing quite unrelated things, but the
picture is clearer for me now, thanks.

> 
>> So at least, I think we should make it possible for all filters to be explicit
>> if user wants, by setting its name like in mirror.
> 
> For all filters that the user has no other way of creating them, yes.
> (That is, the filters created by the block jobs.)
> 
>> (and then, I will not really care of implicit-filters related logic, like I don't
>> really care about filename-api related logic)
>>
>> Also, now, implict filters are actually available for direct manipulation by user,
>> as their node-names are exported in query-named-block-nodes and nothing prevents
>> using these names in differet APIs.
> 
> Hm, yes.  Is that a problem?

No. I tried to prove that implicit filters are a "bad feature", but now that I see
that they exist for compatibility, these reasons become weak.

[..]
Kevin Wolf June 18, 2019, 8:20 a.m. UTC | #13
Am 18.06.2019 um 09:19 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 17.06.2019 19:25, Kevin Wolf wrote:
> > The first step would be making the block drivers full replacements of
> > the old things, which unfortunately isn't true today.
> > 
> > After your "deal with filters" series, we should be a lot closer for the
> > core infrastructure at least.
> > 
> > Not sure about copy-on-read, but I know that throttle still doesn't have
> > feature parity with -drive throttling. At least I'm pretty sure that
> > something about changing the group or group options at runtime (and not
> > just dropping the x-) was missing.
> 
> OK, thanks, I understand now that implicit filters are not just a
> feature but compatibility mechanism.
> 
> So, can we at some point deprecate "optionality" of filter-node-name
> parameters of jobs, in addition to deprecation of things like old
> copy-on-read option?  And actually deprecate implicit filters by this?

Hm, I'm not sure if we have ever moved an optional feature to required,
and how to communicate this to libvirt, but this would be ideal, yes.

> >>>>>>> But should really filter do that job, or it is here only to do CBW? Maybe target
> >>>>>>> must have another parent which will care about flushing.
> >>>>>>>
> >>>>>>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
> >>>>>>> smarter would be better.
> >>>>>>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
> >>>>>>
> >>>>>> [1] Thinking more about this, for normal backups the target file does
> >>>>>> not reflect a valid disk state until the backup is done; just like for
> >>>>>> qemu-img convert.  Just like qemu-img convert, there is therefore no
> >>>>>> reason to flush the target until the job is done.
> >>>
> >>> Depends, the target could have the source as its backing file (like
> >>> fleecing, but without exporting it right now), and then you could always
> >>> have a consistent view on the target. Well, almost.
> >>>
> >>> Almost because to guarantee this, we'd have to flush between each CBW
> >>> and the corresponding write to the same block, to make sure that the old
> >>> data is backed up before it is overwritten.
> >>
> >> Yes, that’s what I meant by “enforce on the host that the target is
> >> always flushed before the source”.  Well, I meant to say there is no
> >> good way of doing that, and I kind of don’t consider this a good way.
> >>
> >>> Of course, this would perform about as badly as internal COW in qcow2...
> >>> So probably not a guarantee we should be making by default. But it might
> >>> make sense as an option.
> >>
> >> I don’t know.  “Use this option so your data isn’t lost in case of a
> >> power failure, but it makes everything pretty slow”?  Who is going to do
> >> explicitly enable that (before their first data loss)?
> > 
> > Maybe the backup job wasn't that clever after all. At least if you care
> > about keeping the point-in-time snapshot at the start of the backup job
> > instead of just retrying and getting a snapshot of a different point in
> > time that is just as good.
> > 
> > If you do care about the specific point in time, the safe way to do it
> > is to take a snapshot and copy that away, and then delete the snapshot
> > again.
> > 
> > The only problem is that merging external snapshots is slow and you end
> > up writing the new data twice. If only we had a COW image format... :-)
> 
> So, I still don't understand the reason of flushing backup target in a
> meantime.  Backup target remains invalid until the successful end of
> the job, at which point we definitely flush target, but only once.
> 
> If node crashes during backup, backup would be invalid independently
> of were there flushes after each write (or better just enable
> O_DIRECT) or not.

Say your VM is running on disk.qcow2 and you use a backup job to copy
data to backup.qcow2. At some point, the host crashes. If we made sure
that every CBW to backup.qcow2 has completed before we write new data to
disk.qcow2, backup.qcow2 contains valid data that represents the state
at the start of the backup job as long as you use disk.qcow2 as its
backing file.

The only way to ensure the right order is flushing between the two
operations. If you don't do that, then yes, your backing is invalid
before it has completed.

O_DIRECT doesn't guarantee that the data is on disk, only a flush does
that. Maybe your chances that things actually make it to the disk in
case of a host crash are higher if it sits in the disk cache rather than
in the host's RAM, but there is no guarantee without a flush.

Now flushing the target when the guest flushes its disk doesn't give you
new guarantees. Maybe it increases your chances that you're lucky and
your data is correct, but you won't be able to tell. So... not really a
reason not to do it, but it's probably kind of useless.

Kevin
Vladimir Sementsov-Ogievskiy June 18, 2019, 8:29 a.m. UTC | #14
18.06.2019 11:20, Kevin Wolf wrote:
> Am 18.06.2019 um 09:19 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 17.06.2019 19:25, Kevin Wolf wrote:
>>> The first step would be making the block drivers full replacements of
>>> the old things, which unfortunately isn't true today.
>>>
>>> After your "deal with filters" series, we should be a lot closer for the
>>> core infrastructure at least.
>>>
>>> Not sure about copy-on-read, but I know that throttle still doesn't have
>>> feature parity with -drive throttling. At least I'm pretty sure that
>>> something about changing the group or group options at runtime (and not
>>> just dropping the x-) was missing.
>>
>> OK, thanks, I understand now that implicit filters are not just a
>> feature but compatibility mechanism.
>>
>> So, can we at some point deprecate "optionality" of filter-node-name
>> parameters of jobs, in addition to deprecation of things like old
>> copy-on-read option?  And actually deprecate implicit filters by this?
> 
> Hm, I'm not sure if we have ever moved an optional feature to required,
> and how to communicate this to libvirt, but this would be ideal, yes.
> 
>>>>>>>>> But should really filter do that job, or it is here only to do CBW? Maybe target
>>>>>>>>> must have another parent which will care about flushing.
>>>>>>>>>
>>>>>>>>> Ok, I think I'll flush target here too for safety, and leave a comment, that something
>>>>>>>>> smarter would be better.
>>>>>>>>> (or, if we decide to flush all children by default in generic code, I'll drop this handler)
>>>>>>>>
>>>>>>>> [1] Thinking more about this, for normal backups the target file does
>>>>>>>> not reflect a valid disk state until the backup is done; just like for
>>>>>>>> qemu-img convert.  Just like qemu-img convert, there is therefore no
>>>>>>>> reason to flush the target until the job is done.
>>>>>
>>>>> Depends, the target could have the source as its backing file (like
>>>>> fleecing, but without exporting it right now), and then you could always
>>>>> have a consistent view on the target. Well, almost.
>>>>>
>>>>> Almost because to guarantee this, we'd have to flush between each CBW
>>>>> and the corresponding write to the same block, to make sure that the old
>>>>> data is backed up before it is overwritten.
>>>>
>>>> Yes, that’s what I meant by “enforce on the host that the target is
>>>> always flushed before the source”.  Well, I meant to say there is no
>>>> good way of doing that, and I kind of don’t consider this a good way.
>>>>
>>>>> Of course, this would perform about as badly as internal COW in qcow2...
>>>>> So probably not a guarantee we should be making by default. But it might
>>>>> make sense as an option.
>>>>
>>>> I don’t know.  “Use this option so your data isn’t lost in case of a
>>>> power failure, but it makes everything pretty slow”?  Who is going to do
>>>> explicitly enable that (before their first data loss)?
>>>
>>> Maybe the backup job wasn't that clever after all. At least if you care
>>> about keeping the point-in-time snapshot at the start of the backup job
>>> instead of just retrying and getting a snapshot of a different point in
>>> time that is just as good.
>>>
>>> If you do care about the specific point in time, the safe way to do it
>>> is to take a snapshot and copy that away, and then delete the snapshot
>>> again.
>>>
>>> The only problem is that merging external snapshots is slow and you end
>>> up writing the new data twice. If only we had a COW image format... :-)
>>
>> So, I still don't understand the reason of flushing backup target in a
>> meantime.  Backup target remains invalid until the successful end of
>> the job, at which point we definitely flush target, but only once.
>>
>> If node crashes during backup, backup would be invalid independently
>> of were there flushes after each write (or better just enable
>> O_DIRECT) or not.
> 
> Say your VM is running on disk.qcow2 and you use a backup job to copy
> data to backup.qcow2. At some point, the host crashes. If we made sure
> that every CBW to backup.qcow2 has completed before we write new data to
> disk.qcow2, backup.qcow2 contains valid data that represents the state
> at the start of the backup job as long as you use disk.qcow2 as its
> backing file.
> 
> The only way to ensure the right order is flushing between the two
> operations. If you don't do that, then yes, your backing is invalid
> before it has completed.
> 
> O_DIRECT doesn't guarantee that the data is on disk, only a flush does
> that. Maybe your chances that things actually make it to the disk in
> case of a host crash are higher if it sits in the disk cache rather than
> in the host's RAM, but there is no guarantee without a flush.
> 
> Now flushing the target when the guest flushes its disk doesn't give you
> new guarantees. Maybe it increases your chances that you're lucky and
> your data is correct, but you won't be able to tell. So... not really a
> reason not to do it, but it's probably kind of useless.
> 

All clear now, thanks!
diff mbox series

Patch

diff --git a/block/backup-top.h b/block/backup-top.h
new file mode 100644
index 0000000000..788e18c358
--- /dev/null
+++ b/block/backup-top.h
@@ -0,0 +1,64 @@ 
+/*
+ * backup-top filter driver
+ *
+ * The driver performs Copy-Before-Write (CBW) operation: it is injected above
+ * some node, and before each write it copies _old_ data to the target node.
+ *
+ * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef BACKUP_TOP_H
+#define BACKUP_TOP_H
+
+#include "qemu/osdep.h"
+
+#include "block/block_int.h"
+
+typedef void (*BackupTopProgressCallback)(uint64_t done, void *opaque);
+typedef struct BDRVBackupTopState {
+    HBitmap *copy_bitmap; /* what should be copied to @target on guest write. */
+    BdrvChild *target;
+
+    BackupTopProgressCallback progress_cb;
+    void *progress_opaque;
+} BDRVBackupTopState;
+
+/*
+ * bdrv_backup_top_append
+ *
+ * Append backup_top filter node above @source node. @target node will receive
+ * the data backed up during CBE operations. New filter together with @target
+ * node are attached to @source aio context.
+ *
+ * The resulting filter node is implicit.
+ *
+ * @copy_bitmap selects regions which needs CBW. Furthermore, backup_top will
+ * use exactly this bitmap, so it may be used to control backup_top behavior
+ * dynamically. Caller should not release @copy_bitmap during life-time of
+ * backup_top. Progress is tracked by calling @progress_cb function.
+ */
+BlockDriverState *bdrv_backup_top_append(
+        BlockDriverState *source, BlockDriverState *target,
+        HBitmap *copy_bitmap, Error **errp);
+void bdrv_backup_top_set_progress_callback(
+        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
+        void *progress_opaque);
+void bdrv_backup_top_drop(BlockDriverState *bs);
+
+#endif /* BACKUP_TOP_H */
diff --git a/block/backup-top.c b/block/backup-top.c
new file mode 100644
index 0000000000..1daa02f539
--- /dev/null
+++ b/block/backup-top.c
@@ -0,0 +1,322 @@ 
+/*
+ * backup-top filter driver
+ *
+ * The driver performs Copy-Before-Write (CBW) operation: it is injected above
+ * some node, and before each write it copies _old_ data to the target node.
+ *
+ * Copyright (c) 2018 Virtuozzo International GmbH. All rights reserved.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "block/block_int.h"
+#include "block/qdict.h"
+
+#include "block/backup-top.h"
+
+static coroutine_fn int backup_top_co_preadv(
+        BlockDriverState *bs, uint64_t offset, uint64_t bytes,
+        QEMUIOVector *qiov, int flags)
+{
+    /*
+     * Features to be implemented:
+     * F1. COR. save read data to fleecing target for fast access
+     *     (to reduce reads). This possibly may be done with use of copy-on-read
+     *     filter, but we need an ability to make COR requests optional: for
+     *     example, if target is a ram-cache, and if it is full now, we should
+     *     skip doing COR request, as it is actually not necessary.
+     *
+     * F2. Feature for guest: read from fleecing target if data is in ram-cache
+     *     and is unchanged
+     */
+
+    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
+}
+
+static coroutine_fn int backup_top_cbw(BlockDriverState *bs, uint64_t offset,
+                                       uint64_t bytes)
+{
+    int ret = 0;
+    BDRVBackupTopState *s = bs->opaque;
+    uint64_t gran = 1UL << hbitmap_granularity(s->copy_bitmap);
+    uint64_t end = QEMU_ALIGN_UP(offset + bytes, gran);
+    uint64_t off = QEMU_ALIGN_DOWN(offset, gran), len;
+    void *buf = qemu_blockalign(bs, end - off);
+
+    /*
+     * Features to be implemented:
+     * F3. parallelize copying loop
+     * F4. detect zeroes ? or, otherwise, drop detect zeroes from backup code
+     *     and just enable zeroes detecting on target
+     * F5. use block_status ?
+     * F6. don't copy clusters which are already cached by COR [see F1]
+     * F7. if target is ram-cache and it is full, there should be a possibility
+     *     to drop not necessary data (cached by COR [see F1]) to handle CBW
+     *     fast.
+     */
+
+    len = end - off;
+    while (hbitmap_next_dirty_area(s->copy_bitmap, &off, &len)) {
+        hbitmap_reset(s->copy_bitmap, off, len);
+
+        ret = bdrv_co_pread(bs->backing, off, len, buf,
+                            BDRV_REQ_NO_SERIALISING);
+        if (ret < 0) {
+            hbitmap_set(s->copy_bitmap, off, len);
+            goto out;
+        }
+
+        ret = bdrv_co_pwrite(s->target, off, len, buf, BDRV_REQ_SERIALISING);
+        if (ret < 0) {
+            hbitmap_set(s->copy_bitmap, off, len);
+            goto out;
+        }
+
+        if (s->progress_cb) {
+            s->progress_cb(len, s->progress_opaque);
+        }
+        off += len;
+        if (off >= end) {
+            break;
+        }
+        len = end - off;
+    }
+
+out:
+    qemu_vfree(buf);
+
+    /*
+     * F8. we fail guest request in case of error. We can alter it by
+     * possibility to fail copying process instead, or retry several times, or
+     * may be guest pause, etc.
+     */
+    return ret;
+}
+
+static int coroutine_fn backup_top_co_pdiscard(BlockDriverState *bs,
+                                               int64_t offset, int bytes)
+{
+    int ret = backup_top_cbw(bs, offset, bytes);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /*
+     * Features to be implemented:
+     * F9. possibility of lazy discard: just defer the discard after fleecing
+     *     completion. If write (or new discard) occurs to the same area, just
+     *     drop deferred discard.
+     */
+
+    return bdrv_co_pdiscard(bs->backing, offset, bytes);
+}
+
+static int coroutine_fn backup_top_co_pwrite_zeroes(BlockDriverState *bs,
+        int64_t offset, int bytes, BdrvRequestFlags flags)
+{
+    int ret = backup_top_cbw(bs, offset, bytes);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
+}
+
+static coroutine_fn int backup_top_co_pwritev(BlockDriverState *bs,
+                                              uint64_t offset,
+                                              uint64_t bytes,
+                                              QEMUIOVector *qiov, int flags)
+{
+    if (!(flags & BDRV_REQ_WRITE_UNCHANGED)) {
+        int ret = backup_top_cbw(bs, offset, bytes);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
+}
+
+static int coroutine_fn backup_top_co_flush(BlockDriverState *bs)
+{
+    if (!bs->backing) {
+        return 0;
+    }
+
+    return bdrv_co_flush(bs->backing->bs);
+}
+
+static void backup_top_refresh_filename(BlockDriverState *bs)
+{
+    if (bs->backing == NULL) {
+        /*
+         * we can be here after failed bdrv_attach_child in
+         * bdrv_set_backing_hd
+         */
+        return;
+    }
+    bdrv_refresh_filename(bs->backing->bs);
+    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
+            bs->backing->bs->filename);
+}
+
+static void backup_top_child_perm(BlockDriverState *bs, BdrvChild *c,
+                                  const BdrvChildRole *role,
+                                  BlockReopenQueue *reopen_queue,
+                                  uint64_t perm, uint64_t shared,
+                                  uint64_t *nperm, uint64_t *nshared)
+{
+    /*
+     * We have HBitmap in the state, its size is fixed, so we never allow
+     * resize.
+     */
+    uint64_t rw = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
+                  BLK_PERM_WRITE;
+
+    bdrv_filter_default_perms(bs, c, role, reopen_queue, perm, shared,
+                              nperm, nshared);
+
+    *nperm = *nperm & rw;
+    *nshared = *nshared & rw;
+
+    if (role == &child_file) {
+        /*
+         * Target child
+         *
+         * Share write to target (child_file), to not interfere
+         * with guest writes to its disk which may be in target backing chain.
+         */
+        if (perm & BLK_PERM_WRITE) {
+            *nshared = *nshared | BLK_PERM_WRITE;
+        }
+    } else {
+        /* Source child */
+        if (perm & BLK_PERM_WRITE) {
+            *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+        }
+        *nshared =
+            *nshared & (BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED);
+    }
+}
+
+BlockDriver bdrv_backup_top_filter = {
+    .format_name = "backup-top",
+    .instance_size = sizeof(BDRVBackupTopState),
+
+    .bdrv_co_preadv             = backup_top_co_preadv,
+    .bdrv_co_pwritev            = backup_top_co_pwritev,
+    .bdrv_co_pwrite_zeroes      = backup_top_co_pwrite_zeroes,
+    .bdrv_co_pdiscard           = backup_top_co_pdiscard,
+    .bdrv_co_flush              = backup_top_co_flush,
+
+    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
+
+    .bdrv_refresh_filename      = backup_top_refresh_filename,
+
+    .bdrv_child_perm            = backup_top_child_perm,
+
+    .is_filter = true,
+};
+
+BlockDriverState *bdrv_backup_top_append(BlockDriverState *source,
+                                         BlockDriverState *target,
+                                         HBitmap *copy_bitmap,
+                                         Error **errp)
+{
+    Error *local_err = NULL;
+    BDRVBackupTopState *state;
+    BlockDriverState *top = bdrv_new_open_driver(&bdrv_backup_top_filter,
+                                                 NULL, BDRV_O_RDWR, errp);
+
+    if (!top) {
+        return NULL;
+    }
+
+    top->implicit = true;
+    top->total_sectors = source->total_sectors;
+    top->bl.opt_mem_alignment = MAX(bdrv_opt_mem_align(source),
+                                    bdrv_opt_mem_align(target));
+    top->opaque = state = g_new0(BDRVBackupTopState, 1);
+    state->copy_bitmap = copy_bitmap;
+
+    bdrv_ref(target);
+    state->target = bdrv_attach_child(top, target, "target", &child_file, errp);
+    if (!state->target) {
+        bdrv_unref(target);
+        bdrv_unref(top);
+        return NULL;
+    }
+
+    bdrv_set_aio_context(top, bdrv_get_aio_context(source));
+    bdrv_set_aio_context(target, bdrv_get_aio_context(source));
+
+    bdrv_drained_begin(source);
+
+    bdrv_ref(top);
+    bdrv_append(top, source, &local_err);
+    if (local_err) {
+        error_prepend(&local_err, "Cannot append backup-top filter: ");
+    }
+
+    bdrv_drained_end(source);
+
+    if (local_err) {
+        bdrv_unref_child(top, state->target);
+        bdrv_unref(top);
+        error_propagate(errp, local_err);
+        return NULL;
+    }
+
+    return top;
+}
+
+void bdrv_backup_top_set_progress_callback(
+        BlockDriverState *bs, BackupTopProgressCallback progress_cb,
+        void *progress_opaque)
+{
+    BDRVBackupTopState *s = bs->opaque;
+
+    s->progress_cb = progress_cb;
+    s->progress_opaque = progress_opaque;
+}
+
+void bdrv_backup_top_drop(BlockDriverState *bs)
+{
+    BDRVBackupTopState *s = bs->opaque;
+    AioContext *aio_context = bdrv_get_aio_context(bs);
+
+    aio_context_acquire(aio_context);
+
+    bdrv_drained_begin(bs);
+
+    bdrv_child_try_set_perm(bs->backing, 0, BLK_PERM_ALL, &error_abort);
+    bdrv_replace_node(bs, backing_bs(bs), &error_abort);
+    bdrv_set_backing_hd(bs, NULL, &error_abort);
+
+    bdrv_drained_end(bs);
+
+    if (s->target) {
+        bdrv_unref_child(bs, s->target);
+    }
+    bdrv_unref(bs);
+
+    aio_context_release(aio_context);
+}
diff --git a/block/Makefile.objs b/block/Makefile.objs
index ae11605c9f..dfbdfe6ab4 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -40,6 +40,8 @@  block-obj-y += throttle.o copy-on-read.o
 
 block-obj-y += crypto.o
 
+block-obj-y += backup-top.o
+
 common-obj-y += stream.o
 
 nfs.o-libs         := $(LIBNFS_LIBS)