diff mbox series

block/replication.c: Properly attach children

Message ID 20210706181138.1c0bacd8@gecko.fritz.box
State New
Headers show
Series block/replication.c: Properly attach children | expand

Commit Message

Lukas Straub July 6, 2021, 4:11 p.m. UTC
The replication driver needs access to the children block-nodes of
it's child so it can issue bdrv_make_empty to manage the replication.
However, it does this by directly copying the BdrvChilds, which is
wrong.

Fix this by properly attaching the block-nodes with
bdrv_attach_child().

Also, remove a workaround introduced in commit
6ecbc6c52672db5c13805735ca02784879ce8285
"replication: Avoid blk_make_empty() on read-only child".

Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---

-v2: Test for BDRV_CHILD_PRIMARY in replication_child_perm, since
     bs->file might not be set yet. (Vladimir)

 block/replication.c | 94 +++++++++++++++++++++++++++++----------------
 1 file changed, 61 insertions(+), 33 deletions(-)

Comments

Vladimir Sementsov-Ogievskiy July 7, 2021, 1:01 p.m. UTC | #1
06.07.2021 19:11, Lukas Straub wrote:
> The replication driver needs access to the children block-nodes of
> it's child so it can issue bdrv_make_empty to manage the replication.
> However, it does this by directly copying the BdrvChilds, which is
> wrong.
> 
> Fix this by properly attaching the block-nodes with
> bdrv_attach_child().
> 
> Also, remove a workaround introduced in commit
> 6ecbc6c52672db5c13805735ca02784879ce8285
> "replication: Avoid blk_make_empty() on read-only child".
> 
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
> 
> -v2: Test for BDRV_CHILD_PRIMARY in replication_child_perm, since
>       bs->file might not be set yet. (Vladimir)
> 
>   block/replication.c | 94 +++++++++++++++++++++++++++++----------------
>   1 file changed, 61 insertions(+), 33 deletions(-)
> 
> diff --git a/block/replication.c b/block/replication.c
> index 52163f2d1f..fd8cb728a3 100644
> --- a/block/replication.c
> +++ b/block/replication.c
> @@ -166,7 +166,12 @@ static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
>                                      uint64_t perm, uint64_t shared,
>                                      uint64_t *nperm, uint64_t *nshared)
>   {
> -    *nperm = BLK_PERM_CONSISTENT_READ;
> +    if (role & BDRV_CHILD_PRIMARY) {
> +        *nperm = BLK_PERM_CONSISTENT_READ;
> +    } else {
> +        *nperm = 0;
> +    }

Why you drop READ access for other children? You don't mention it in commit-msg..

Upd: ok now I see that we are not going to read from hidden_disk child, and that's the only "other" child that worth to mention.
Still, we should be sure that hidden_disk child gets WRITE permission in case we are going to call bdrv_make_empty on it.

> +
>       if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
>           *nperm |= BLK_PERM_WRITE;
>       }
> @@ -340,17 +345,7 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
>           return;
>       }
>   
> -    BlockBackend *blk = blk_new(qemu_get_current_aio_context(),
> -                                BLK_PERM_WRITE, BLK_PERM_ALL);
> -    blk_insert_bs(blk, s->hidden_disk->bs, &local_err);
> -    if (local_err) {
> -        error_propagate(errp, local_err);
> -        blk_unref(blk);
> -        return;
> -    }
> -
> -    ret = blk_make_empty(blk, errp);
> -    blk_unref(blk);
> +    ret = bdrv_make_empty(s->hidden_disk, errp);

So, here you rely on BLK_PERM_WRITE being set in replication_child_perm().. Probably that's OK, however logic is changed. Shouldn't we always require write permission in replication_child_perm() for hidden_disk ?

>       if (ret < 0) {
>           return;
>       }
> @@ -365,27 +360,35 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
>                                   Error **errp)
>   {
>       BDRVReplicationState *s = bs->opaque;
> +    BdrvChild *hidden_disk, *secondary_disk;
>       BlockReopenQueue *reopen_queue = NULL;
>   
> +    /*
> +     * s->hidden_disk and s->secondary_disk may not be set yet, as they will
> +     * only be set after the children are writable.
> +     */
> +    hidden_disk = bs->file->bs->backing;
> +    secondary_disk = hidden_disk->bs->backing;
> +
>       if (writable) {
> -        s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
> -        s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
> +        s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
> +        s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
>       }
>   
> -    bdrv_subtree_drained_begin(s->hidden_disk->bs);
> -    bdrv_subtree_drained_begin(s->secondary_disk->bs);
> +    bdrv_subtree_drained_begin(hidden_disk->bs);
> +    bdrv_subtree_drained_begin(secondary_disk->bs);

That kind of staff may be done as a separate preparation patch, with no-logic-change refactoring, this makes the main logic-change patch simpler.

>   
>       if (s->orig_hidden_read_only) {
>           QDict *opts = qdict_new();
>           qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
> -        reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
> +        reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
>                                            opts, true);
>       }
>   
>       if (s->orig_secondary_read_only) {
>           QDict *opts = qdict_new();
>           qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
> -        reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
> +        reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
>                                            opts, true);
>       }, probably it could be a separate patch if it is needed.
>   
> @@ -393,8 +396,8 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
>           bdrv_reopen_multiple(reopen_queue, errp);
>       }
>   
> -    bdrv_subtree_drained_end(s->hidden_disk->bs);
> -    bdrv_subtree_drained_end(s->secondary_disk->bs);
> +    bdrv_subtree_drained_end(hidden_disk->bs);
> +    bdrv_subtree_drained_end(secondary_disk->bs);
>   }
>   
>   static void backup_job_cleanup(BlockDriverState *bs)
> @@ -451,6 +454,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>       BlockDriverState *bs = rs->opaque;
>       BDRVReplicationState *s;
>       BlockDriverState *top_bs;
> +    BdrvChild *active_disk, *hidden_disk, *secondary_disk;
>       int64_t active_length, hidden_length, disk_length;
>       AioContext *aio_context;
>       Error *local_err = NULL;
> @@ -488,32 +492,32 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>       case REPLICATION_MODE_PRIMARY:
>           break;
>       case REPLICATION_MODE_SECONDARY:
> -        s->active_disk = bs->file;
> -        if (!s->active_disk || !s->active_disk->bs ||
> -                                    !s->active_disk->bs->backing) {
> +        active_disk = bs->file;
> +        if (!active_disk || !active_disk->bs ||
> +                                    !active_disk->bs->backing) {
>               error_setg(errp, "Active disk doesn't have backing file");
>               aio_context_release(aio_context);
>               return;
>           }
>   
> -        s->hidden_disk = s->active_disk->bs->backing;
> -        if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
> +        hidden_disk = active_disk->bs->backing;
> +        if (!hidden_disk->bs || !hidden_disk->bs->backing) {
>               error_setg(errp, "Hidden disk doesn't have backing file");
>               aio_context_release(aio_context);
>               return;
>           }
>   
> -        s->secondary_disk = s->hidden_disk->bs->backing;
> -        if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
> +        secondary_disk = hidden_disk->bs->backing;
> +        if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
>               error_setg(errp, "The secondary disk doesn't have block backend");
>               aio_context_release(aio_context);
>               return;
>           }
>   , probably it could be a separate patch if it is needed.
>           /* verify the length */
> -        active_length = bdrv_getlength(s->active_disk->bs);
> -        hidden_length = bdrv_getlength(s->hidden_disk->bs);
> -        disk_length = bdrv_getlength(s->secondary_disk->bs);
> +        active_length = bdrv_getlength(active_disk->bs);
> +        hidden_length = bdrv_getlength(hidden_disk->bs);
> +        disk_length = bdrv_getlength(secondary_disk->bs);
>           if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
>               active_length != hidden_length || hidden_length != disk_length) {
>               error_setg(errp, "Active disk, hidden disk, secondary disk's length"
> @@ -523,10 +527,10 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>           }
>   
>           /* Must be true, or the bdrv_getlength() calls would have failed */
> -        assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
> +        assert(active_disk->bs->drv && hidden_disk->bs->drv);
>   
> -        if (!s->active_disk->bs->drv->bdrv_make_empty ||
> -            !s->hidden_disk->bs->drv->bdrv_make_empty) {
> +        if (!active_disk->bs->drv->bdrv_make_empty ||
> +            !hidden_disk->bs->drv->bdrv_make_empty) {
>               error_setg(errp,
>                          "Active disk or hidden disk doesn't support make_empty");
>               aio_context_release(aio_context);
> @@ -541,6 +545,28 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>               return;
>           }
>   
> +        s->active_disk = active_disk;
> +
> +        bdrv_ref(hidden_disk->bs);
> +        s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
> +                                           &child_of_bds, BDRV_CHILD_DATA,
> +                                           &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            aio_context_release(aio_context);
> +            return;
> +        }

Ok, the point of creating hidden_disk is to call bdrv_make_empty on it.

> +
> +        bdrv_ref(secondary_disk->bs);
> +        s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
> +                                              "secondary disk", &child_of_bds,
> +                                              BDRV_CHILD_DATA, &local_err);
> +        if (local_err) {
> +            error_propagate(errp, local_err);
> +            aio_context_release(aio_context);
> +            return;
> +        }

But s->secondary_disk child is actually unused.. No reason to create it.

> +
>           /* start backup job now */
>           error_setg(&s->blocker,
>                      "Block device is in use by internal backup job");
> @@ -646,7 +672,9 @@ static void replication_done(void *opaque, int ret)
>           s->stage = BLOCK_REPLICATION_DONE;
>   
>           s->active_disk = NULL;
> +        bdrv_unref_child(bs, s->secondary_disk);
>           s->secondary_disk = NULL;
> +        bdrv_unref_child(bs, s->hidden_disk);
>           s->hidden_disk = NULL;
>           s->error = 0;
>       } else {
> 

For me it looks like the good way to update is:

1. drop s->active_disk. it seems to be just a copy of bs->file, better to use bs->file directly, like other drivers do.
2. reduce usage of s->hidden_disk and s->secondary_disk, like you do in this patch, using local variables instead. Also probably just drop s->secondary_disk..
3. introduce a child, to be used with bdrv_make_empty(s->hidden_disk)

And these are 3 separate patches. 1 and 2 may be merged, or instead 2 may be divided into two (to refactor secondary_disk and hidden_disk separately)..

Still, I'm not a maintainer of replication, neither I have good understanding of how it works, so don't take my advises to heart :)
Lukas Straub July 7, 2021, 2:53 p.m. UTC | #2
Hi,
Thanks for your review. More below.

Btw: There is a overview of the replication design in
docs/block-replication.txt

On Wed, 7 Jul 2021 16:01:31 +0300
Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> wrote:

> 06.07.2021 19:11, Lukas Straub wrote:
> > The replication driver needs access to the children block-nodes of
> > it's child so it can issue bdrv_make_empty to manage the replication.
> > However, it does this by directly copying the BdrvChilds, which is
> > wrong.
> > 
> > Fix this by properly attaching the block-nodes with
> > bdrv_attach_child().
> > 
> > Also, remove a workaround introduced in commit
> > 6ecbc6c52672db5c13805735ca02784879ce8285
> > "replication: Avoid blk_make_empty() on read-only child".
> > 
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> > 
> > -v2: Test for BDRV_CHILD_PRIMARY in replication_child_perm, since
> >       bs->file might not be set yet. (Vladimir)
> > 
> >   block/replication.c | 94 +++++++++++++++++++++++++++++----------------
> >   1 file changed, 61 insertions(+), 33 deletions(-)
> > 
> > diff --git a/block/replication.c b/block/replication.c
> > index 52163f2d1f..fd8cb728a3 100644
> > --- a/block/replication.c
> > +++ b/block/replication.c
> > @@ -166,7 +166,12 @@ static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
> >                                      uint64_t perm, uint64_t shared,
> >                                      uint64_t *nperm, uint64_t *nshared)
> >   {
> > -    *nperm = BLK_PERM_CONSISTENT_READ;
> > +    if (role & BDRV_CHILD_PRIMARY) {
> > +        *nperm = BLK_PERM_CONSISTENT_READ;
> > +    } else {
> > +        *nperm = 0;
> > +    }  
> 
> Why you drop READ access for other children? You don't mention it in commit-msg..
> 
> Upd: ok now I see that we are not going to read from hidden_disk child, and that's the only "other" child that worth to mention.
> Still, we should be sure that hidden_disk child gets WRITE permission in case we are going to call bdrv_make_empty on it.

The code below that in replication_child_perm() should make sure of
that or am i misunderstanding it?

Or do you mean that it should always request WRITE regardless of
bs->open_flags & BDRV_O_INACTIVE?

> > +
> >       if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
> >           *nperm |= BLK_PERM_WRITE;
> >       }
> > @@ -340,17 +345,7 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
> >           return;
> >       }
> >   
> > -    BlockBackend *blk = blk_new(qemu_get_current_aio_context(),
> > -                                BLK_PERM_WRITE, BLK_PERM_ALL);
> > -    blk_insert_bs(blk, s->hidden_disk->bs, &local_err);
> > -    if (local_err) {
> > -        error_propagate(errp, local_err);
> > -        blk_unref(blk);
> > -        return;
> > -    }
> > -
> > -    ret = blk_make_empty(blk, errp);
> > -    blk_unref(blk);
> > +    ret = bdrv_make_empty(s->hidden_disk, errp);  
> 
> So, here you rely on BLK_PERM_WRITE being set in replication_child_perm().. Probably that's OK, however logic is changed. Shouldn't we always require write permission in replication_child_perm() for hidden_disk ?
> 
> >       if (ret < 0) {
> >           return;
> >       }
> > @@ -365,27 +360,35 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
> >                                   Error **errp)
> >   {
> >       BDRVReplicationState *s = bs->opaque;
> > +    BdrvChild *hidden_disk, *secondary_disk;
> >       BlockReopenQueue *reopen_queue = NULL;
> >   
> > +    /*
> > +     * s->hidden_disk and s->secondary_disk may not be set yet, as they will
> > +     * only be set after the children are writable.
> > +     */
> > +    hidden_disk = bs->file->bs->backing;
> > +    secondary_disk = hidden_disk->bs->backing;
> > +
> >       if (writable) {
> > -        s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
> > -        s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
> > +        s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
> > +        s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
> >       }
> >   
> > -    bdrv_subtree_drained_begin(s->hidden_disk->bs);
> > -    bdrv_subtree_drained_begin(s->secondary_disk->bs);
> > +    bdrv_subtree_drained_begin(hidden_disk->bs);
> > +    bdrv_subtree_drained_begin(secondary_disk->bs);  
> 
> That kind of staff may be done as a separate preparation patch, with no-logic-change refactoring, this makes the main logic-change patch simpler.

Yes, makes sense. Will do in the next version.

> >   
> >       if (s->orig_hidden_read_only) {
> >           QDict *opts = qdict_new();
> >           qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
> > -        reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
> > +        reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
> >                                            opts, true);
> >       }
> >   
> >       if (s->orig_secondary_read_only) {
> >           QDict *opts = qdict_new();
> >           qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
> > -        reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
> > +        reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
> >                                            opts, true);
> >       }, probably it could be a separate patch if it is needed.
> >   
> > @@ -393,8 +396,8 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
> >           bdrv_reopen_multiple(reopen_queue, errp);
> >       }
> >   
> > -    bdrv_subtree_drained_end(s->hidden_disk->bs);
> > -    bdrv_subtree_drained_end(s->secondary_disk->bs);
> > +    bdrv_subtree_drained_end(hidden_disk->bs);
> > +    bdrv_subtree_drained_end(secondary_disk->bs);
> >   }
> >   
> >   static void backup_job_cleanup(BlockDriverState *bs)
> > @@ -451,6 +454,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
> >       BlockDriverState *bs = rs->opaque;
> >       BDRVReplicationState *s;
> >       BlockDriverState *top_bs;
> > +    BdrvChild *active_disk, *hidden_disk, *secondary_disk;
> >       int64_t active_length, hidden_length, disk_length;
> >       AioContext *aio_context;
> >       Error *local_err = NULL;
> > @@ -488,32 +492,32 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
> >       case REPLICATION_MODE_PRIMARY:
> >           break;
> >       case REPLICATION_MODE_SECONDARY:
> > -        s->active_disk = bs->file;
> > -        if (!s->active_disk || !s->active_disk->bs ||
> > -                                    !s->active_disk->bs->backing) {
> > +        active_disk = bs->file;
> > +        if (!active_disk || !active_disk->bs ||
> > +                                    !active_disk->bs->backing) {
> >               error_setg(errp, "Active disk doesn't have backing file");
> >               aio_context_release(aio_context);
> >               return;
> >           }
> >   
> > -        s->hidden_disk = s->active_disk->bs->backing;
> > -        if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
> > +        hidden_disk = active_disk->bs->backing;
> > +        if (!hidden_disk->bs || !hidden_disk->bs->backing) {
> >               error_setg(errp, "Hidden disk doesn't have backing file");
> >               aio_context_release(aio_context);
> >               return;
> >           }
> >   
> > -        s->secondary_disk = s->hidden_disk->bs->backing;
> > -        if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
> > +        secondary_disk = hidden_disk->bs->backing;
> > +        if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
> >               error_setg(errp, "The secondary disk doesn't have block backend");
> >               aio_context_release(aio_context);
> >               return;
> >           }
> >   , probably it could be a separate patch if it is needed.

Ok.

> >           /* verify the length */
> > -        active_length = bdrv_getlength(s->active_disk->bs);
> > -        hidden_length = bdrv_getlength(s->hidden_disk->bs);
> > -        disk_length = bdrv_getlength(s->secondary_disk->bs);
> > +        active_length = bdrv_getlength(active_disk->bs);
> > +        hidden_length = bdrv_getlength(hidden_disk->bs);
> > +        disk_length = bdrv_getlength(secondary_disk->bs);
> >           if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
> >               active_length != hidden_length || hidden_length != disk_length) {
> >               error_setg(errp, "Active disk, hidden disk, secondary disk's length"
> > @@ -523,10 +527,10 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
> >           }
> >   
> >           /* Must be true, or the bdrv_getlength() calls would have failed */
> > -        assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
> > +        assert(active_disk->bs->drv && hidden_disk->bs->drv);
> >   
> > -        if (!s->active_disk->bs->drv->bdrv_make_empty ||
> > -            !s->hidden_disk->bs->drv->bdrv_make_empty) {
> > +        if (!active_disk->bs->drv->bdrv_make_empty ||
> > +            !hidden_disk->bs->drv->bdrv_make_empty) {
> >               error_setg(errp,
> >                          "Active disk or hidden disk doesn't support make_empty");
> >               aio_context_release(aio_context);
> > @@ -541,6 +545,28 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
> >               return;
> >           }
> >   
> > +        s->active_disk = active_disk;
> > +
> > +        bdrv_ref(hidden_disk->bs);
> > +        s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
> > +                                           &child_of_bds, BDRV_CHILD_DATA,
> > +                                           &local_err);
> > +        if (local_err) {
> > +            error_propagate(errp, local_err);
> > +            aio_context_release(aio_context);
> > +            return;
> > +        }  
> 
> Ok, the point of creating hidden_disk is to call bdrv_make_empty on it.
> 
> > +
> > +        bdrv_ref(secondary_disk->bs);
> > +        s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
> > +                                              "secondary disk", &child_of_bds,
> > +                                              BDRV_CHILD_DATA, &local_err);
> > +        if (local_err) {
> > +            error_propagate(errp, local_err);
> > +            aio_context_release(aio_context);
> > +            return;
> > +        }  
> 
> But s->secondary_disk child is actually unused.. No reason to create it.

It is used, look closely at replication_co_writev().

If the commit job (run during failover in replication_stop()) fails,
replication enters "failover failed" state. In that state it writes
directly to the secondary disk if possible (i.e. if the sector to write
is not already allocated on the active or hidden disk).

It does this so the active and hidden disks don't grow larger, since in
the COLO use-case they usually are put on a ramdisk with limited size.

> > +
> >           /* start backup job now */
> >           error_setg(&s->blocker,
> >                      "Block device is in use by internal backup job");
> > @@ -646,7 +672,9 @@ static void replication_done(void *opaque, int ret)
> >           s->stage = BLOCK_REPLICATION_DONE;
> >   
> >           s->active_disk = NULL;
> > +        bdrv_unref_child(bs, s->secondary_disk);
> >           s->secondary_disk = NULL;
> > +        bdrv_unref_child(bs, s->hidden_disk);
> >           s->hidden_disk = NULL;
> >           s->error = 0;
> >       } else {
> >   
> 
> For me it looks like the good way to update is:
> 
> 1. drop s->active_disk. it seems to be just a copy of bs->file, better to use bs->file directly, like other drivers do.
> 2. reduce usage of s->hidden_disk and s->secondary_disk, like you do in this patch, using local variables instead. Also probably just drop s->secondary_disk..
> 3. introduce a child, to be used with bdrv_make_empty(s->hidden_disk)
> 
> And these are 3 separate patches. 1 and 2 may be merged, or instead 2 may be divided into two (to refactor secondary_disk and hidden_disk separately)..

Sound good, will do.

> Still, I'm not a maintainer of replication, neither I have good understanding of how it works, so don't take my advises to heart :)
> 



--
Vladimir Sementsov-Ogievskiy July 7, 2021, 4:33 p.m. UTC | #3
07.07.2021 17:53, Lukas Straub wrote:
> Hi,
> Thanks for your review. More below.
> 
> Btw: There is a overview of the replication design in
> docs/block-replication.txt
> 
> On Wed, 7 Jul 2021 16:01:31 +0300
> Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com> wrote:
> 
>> 06.07.2021 19:11, Lukas Straub wrote:
>>> The replication driver needs access to the children block-nodes of
>>> it's child so it can issue bdrv_make_empty to manage the replication.
>>> However, it does this by directly copying the BdrvChilds, which is
>>> wrong.
>>>
>>> Fix this by properly attaching the block-nodes with
>>> bdrv_attach_child().
>>>
>>> Also, remove a workaround introduced in commit
>>> 6ecbc6c52672db5c13805735ca02784879ce8285
>>> "replication: Avoid blk_make_empty() on read-only child".
>>>
>>> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
>>> ---
>>>
>>> -v2: Test for BDRV_CHILD_PRIMARY in replication_child_perm, since
>>>        bs->file might not be set yet. (Vladimir)
>>>
>>>    block/replication.c | 94 +++++++++++++++++++++++++++++----------------
>>>    1 file changed, 61 insertions(+), 33 deletions(-)
>>>
>>> diff --git a/block/replication.c b/block/replication.c
>>> index 52163f2d1f..fd8cb728a3 100644
>>> --- a/block/replication.c
>>> +++ b/block/replication.c
>>> @@ -166,7 +166,12 @@ static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
>>>                                       uint64_t perm, uint64_t shared,
>>>                                       uint64_t *nperm, uint64_t *nshared)
>>>    {
>>> -    *nperm = BLK_PERM_CONSISTENT_READ;
>>> +    if (role & BDRV_CHILD_PRIMARY) {
>>> +        *nperm = BLK_PERM_CONSISTENT_READ;
>>> +    } else {
>>> +        *nperm = 0;
>>> +    }
>>
>> Why you drop READ access for other children? You don't mention it in commit-msg..
>>
>> Upd: ok now I see that we are not going to read from hidden_disk child, and that's the only "other" child that worth to mention.
>> Still, we should be sure that hidden_disk child gets WRITE permission in case we are going to call bdrv_make_empty on it.
> 
> The code below that in replication_child_perm() should make sure of
> that or am i misunderstanding it?
> 
> Or do you mean that it should always request WRITE regardless of
> bs->open_flags & BDRV_O_INACTIVE?

Probably that.

I mean the following:

Do you know the circumstances when secondary_do_checkpoint() is called? With new code, could it be called when we don't have WRITE permission on hidden_disk? If it could, it's a bug.

And this patch should answer this question, because pre-patch we create blk with explicitly set BLK_PERM_WRITE. After-patch we rely on existing code in replication_child_perm().

> 
>>> +
>>>        if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
>>>            *nperm |= BLK_PERM_WRITE;
>>>        }
>>> @@ -340,17 +345,7 @@ static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
>>>            return;
>>>        }
>>>    
>>> -    BlockBackend *blk = blk_new(qemu_get_current_aio_context(),
>>> -                                BLK_PERM_WRITE, BLK_PERM_ALL);
>>> -    blk_insert_bs(blk, s->hidden_disk->bs, &local_err);
>>> -    if (local_err) {
>>> -        error_propagate(errp, local_err);
>>> -        blk_unref(blk);
>>> -        return;
>>> -    }
>>> -
>>> -    ret = blk_make_empty(blk, errp);
>>> -    blk_unref(blk);
>>> +    ret = bdrv_make_empty(s->hidden_disk, errp);
>>
>> So, here you rely on BLK_PERM_WRITE being set in replication_child_perm().. Probably that's OK, however logic is changed. Shouldn't we always require write permission in replication_child_perm() for hidden_disk ?
>>
>>>        if (ret < 0) {
>>>            return;
>>>        }
>>> @@ -365,27 +360,35 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
>>>                                    Error **errp)
>>>    {
>>>        BDRVReplicationState *s = bs->opaque;
>>> +    BdrvChild *hidden_disk, *secondary_disk;
>>>        BlockReopenQueue *reopen_queue = NULL;
>>>    
>>> +    /*
>>> +     * s->hidden_disk and s->secondary_disk may not be set yet, as they will
>>> +     * only be set after the children are writable.
>>> +     */
>>> +    hidden_disk = bs->file->bs->backing;
>>> +    secondary_disk = hidden_disk->bs->backing;
>>> +
>>>        if (writable) {
>>> -        s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
>>> -        s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
>>> +        s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
>>> +        s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
>>>        }
>>>    
>>> -    bdrv_subtree_drained_begin(s->hidden_disk->bs);
>>> -    bdrv_subtree_drained_begin(s->secondary_disk->bs);
>>> +    bdrv_subtree_drained_begin(hidden_disk->bs);
>>> +    bdrv_subtree_drained_begin(secondary_disk->bs);
>>
>> That kind of staff may be done as a separate preparation patch, with no-logic-change refactoring, this makes the main logic-change patch simpler.
> 
> Yes, makes sense. Will do in the next version.
> 
>>>    
>>>        if (s->orig_hidden_read_only) {
>>>            QDict *opts = qdict_new();
>>>            qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
>>> -        reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
>>> +        reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
>>>                                             opts, true);
>>>        }
>>>    
>>>        if (s->orig_secondary_read_only) {
>>>            QDict *opts = qdict_new();
>>>            qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
>>> -        reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
>>> +        reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
>>>                                             opts, true);
>>>        }, probably it could be a separate patch if it is needed.
>>>    
>>> @@ -393,8 +396,8 @@ static void reopen_backing_file(BlockDriverState *bs, bool writable,
>>>            bdrv_reopen_multiple(reopen_queue, errp);
>>>        }
>>>    
>>> -    bdrv_subtree_drained_end(s->hidden_disk->bs);
>>> -    bdrv_subtree_drained_end(s->secondary_disk->bs);
>>> +    bdrv_subtree_drained_end(hidden_disk->bs);
>>> +    bdrv_subtree_drained_end(secondary_disk->bs);
>>>    }
>>>    
>>>    static void backup_job_cleanup(BlockDriverState *bs)
>>> @@ -451,6 +454,7 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>>>        BlockDriverState *bs = rs->opaque;
>>>        BDRVReplicationState *s;
>>>        BlockDriverState *top_bs;
>>> +    BdrvChild *active_disk, *hidden_disk, *secondary_disk;
>>>        int64_t active_length, hidden_length, disk_length;
>>>        AioContext *aio_context;
>>>        Error *local_err = NULL;
>>> @@ -488,32 +492,32 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>>>        case REPLICATION_MODE_PRIMARY:
>>>            break;
>>>        case REPLICATION_MODE_SECONDARY:
>>> -        s->active_disk = bs->file;
>>> -        if (!s->active_disk || !s->active_disk->bs ||
>>> -                                    !s->active_disk->bs->backing) {
>>> +        active_disk = bs->file;
>>> +        if (!active_disk || !active_disk->bs ||
>>> +                                    !active_disk->bs->backing) {
>>>                error_setg(errp, "Active disk doesn't have backing file");
>>>                aio_context_release(aio_context);
>>>                return;
>>>            }
>>>    
>>> -        s->hidden_disk = s->active_disk->bs->backing;
>>> -        if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
>>> +        hidden_disk = active_disk->bs->backing;
>>> +        if (!hidden_disk->bs || !hidden_disk->bs->backing) {
>>>                error_setg(errp, "Hidden disk doesn't have backing file");
>>>                aio_context_release(aio_context);
>>>                return;
>>>            }
>>>    
>>> -        s->secondary_disk = s->hidden_disk->bs->backing;
>>> -        if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
>>> +        secondary_disk = hidden_disk->bs->backing;
>>> +        if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
>>>                error_setg(errp, "The secondary disk doesn't have block backend");
>>>                aio_context_release(aio_context);
>>>                return;
>>>            }
>>>    , probably it could be a separate patch if it is needed.
> 
> Ok.
> 
>>>            /* verify the length */
>>> -        active_length = bdrv_getlength(s->active_disk->bs);
>>> -        hidden_length = bdrv_getlength(s->hidden_disk->bs);
>>> -        disk_length = bdrv_getlength(s->secondary_disk->bs);
>>> +        active_length = bdrv_getlength(active_disk->bs);
>>> +        hidden_length = bdrv_getlength(hidden_disk->bs);
>>> +        disk_length = bdrv_getlength(secondary_disk->bs);
>>>            if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
>>>                active_length != hidden_length || hidden_length != disk_length) {
>>>                error_setg(errp, "Active disk, hidden disk, secondary disk's length"
>>> @@ -523,10 +527,10 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>>>            }
>>>    
>>>            /* Must be true, or the bdrv_getlength() calls would have failed */
>>> -        assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
>>> +        assert(active_disk->bs->drv && hidden_disk->bs->drv);
>>>    
>>> -        if (!s->active_disk->bs->drv->bdrv_make_empty ||
>>> -            !s->hidden_disk->bs->drv->bdrv_make_empty) {
>>> +        if (!active_disk->bs->drv->bdrv_make_empty ||
>>> +            !hidden_disk->bs->drv->bdrv_make_empty) {
>>>                error_setg(errp,
>>>                           "Active disk or hidden disk doesn't support make_empty");
>>>                aio_context_release(aio_context);
>>> @@ -541,6 +545,28 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
>>>                return;
>>>            }
>>>    
>>> +        s->active_disk = active_disk;
>>> +
>>> +        bdrv_ref(hidden_disk->bs);
>>> +        s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
>>> +                                           &child_of_bds, BDRV_CHILD_DATA,
>>> +                                           &local_err);
>>> +        if (local_err) {
>>> +            error_propagate(errp, local_err);
>>> +            aio_context_release(aio_context);
>>> +            return;
>>> +        }
>>
>> Ok, the point of creating hidden_disk is to call bdrv_make_empty on it.
>>
>>> +
>>> +        bdrv_ref(secondary_disk->bs);
>>> +        s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
>>> +                                              "secondary disk", &child_of_bds,
>>> +                                              BDRV_CHILD_DATA, &local_err);
>>> +        if (local_err) {
>>> +            error_propagate(errp, local_err);
>>> +            aio_context_release(aio_context);
>>> +            return;
>>> +        }
>>
>> But s->secondary_disk child is actually unused.. No reason to create it.
> 
> It is used, look closely at replication_co_writev().

Ah right, sorry. "base = s->secondary_disk" and next works with base. I see that now.

> 
> If the commit job (run during failover in replication_stop()) fails,
> replication enters "failover failed" state. In that state it writes
> directly to the secondary disk if possible (i.e. if the sector to write
> is not already allocated on the active or hidden disk).
> 
> It does this so the active and hidden disks don't grow larger, since in
> the COLO use-case they usually are put on a ramdisk with limited size.
> 
>>> +
>>>            /* start backup job now */
>>>            error_setg(&s->blocker,
>>>                       "Block device is in use by internal backup job");
>>> @@ -646,7 +672,9 @@ static void replication_done(void *opaque, int ret)
>>>            s->stage = BLOCK_REPLICATION_DONE;
>>>    
>>>            s->active_disk = NULL;
>>> +        bdrv_unref_child(bs, s->secondary_disk);
>>>            s->secondary_disk = NULL;
>>> +        bdrv_unref_child(bs, s->hidden_disk);
>>>            s->hidden_disk = NULL;
>>>            s->error = 0;
>>>        } else {
>>>    
>>
>> For me it looks like the good way to update is:
>>
>> 1. drop s->active_disk. it seems to be just a copy of bs->file, better to use bs->file directly, like other drivers do.
>> 2. reduce usage of s->hidden_disk and s->secondary_disk, like you do in this patch, using local variables instead. Also probably just drop s->secondary_disk..
>> 3. introduce a child, to be used with bdrv_make_empty(s->hidden_disk)
>>
>> And these are 3 separate patches. 1 and 2 may be merged, or instead 2 may be divided into two (to refactor secondary_disk and hidden_disk separately)..
> 
> Sound good, will do.
> 
>> Still, I'm not a maintainer of replication, neither I have good understanding of how it works, so don't take my advises to heart :)
>>
> 
> 
>
diff mbox series

Patch

diff --git a/block/replication.c b/block/replication.c
index 52163f2d1f..fd8cb728a3 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -166,7 +166,12 @@  static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
                                    uint64_t perm, uint64_t shared,
                                    uint64_t *nperm, uint64_t *nshared)
 {
-    *nperm = BLK_PERM_CONSISTENT_READ;
+    if (role & BDRV_CHILD_PRIMARY) {
+        *nperm = BLK_PERM_CONSISTENT_READ;
+    } else {
+        *nperm = 0;
+    }
+
     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
         *nperm |= BLK_PERM_WRITE;
     }
@@ -340,17 +345,7 @@  static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
         return;
     }
 
-    BlockBackend *blk = blk_new(qemu_get_current_aio_context(),
-                                BLK_PERM_WRITE, BLK_PERM_ALL);
-    blk_insert_bs(blk, s->hidden_disk->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        blk_unref(blk);
-        return;
-    }
-
-    ret = blk_make_empty(blk, errp);
-    blk_unref(blk);
+    ret = bdrv_make_empty(s->hidden_disk, errp);
     if (ret < 0) {
         return;
     }
@@ -365,27 +360,35 @@  static void reopen_backing_file(BlockDriverState *bs, bool writable,
                                 Error **errp)
 {
     BDRVReplicationState *s = bs->opaque;
+    BdrvChild *hidden_disk, *secondary_disk;
     BlockReopenQueue *reopen_queue = NULL;
 
+    /*
+     * s->hidden_disk and s->secondary_disk may not be set yet, as they will
+     * only be set after the children are writable.
+     */
+    hidden_disk = bs->file->bs->backing;
+    secondary_disk = hidden_disk->bs->backing;
+
     if (writable) {
-        s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
-        s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
+        s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
+        s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
     }
 
-    bdrv_subtree_drained_begin(s->hidden_disk->bs);
-    bdrv_subtree_drained_begin(s->secondary_disk->bs);
+    bdrv_subtree_drained_begin(hidden_disk->bs);
+    bdrv_subtree_drained_begin(secondary_disk->bs);
 
     if (s->orig_hidden_read_only) {
         QDict *opts = qdict_new();
         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
-        reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
+        reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
                                          opts, true);
     }
 
     if (s->orig_secondary_read_only) {
         QDict *opts = qdict_new();
         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
-        reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
+        reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
                                          opts, true);
     }
 
@@ -393,8 +396,8 @@  static void reopen_backing_file(BlockDriverState *bs, bool writable,
         bdrv_reopen_multiple(reopen_queue, errp);
     }
 
-    bdrv_subtree_drained_end(s->hidden_disk->bs);
-    bdrv_subtree_drained_end(s->secondary_disk->bs);
+    bdrv_subtree_drained_end(hidden_disk->bs);
+    bdrv_subtree_drained_end(secondary_disk->bs);
 }
 
 static void backup_job_cleanup(BlockDriverState *bs)
@@ -451,6 +454,7 @@  static void replication_start(ReplicationState *rs, ReplicationMode mode,
     BlockDriverState *bs = rs->opaque;
     BDRVReplicationState *s;
     BlockDriverState *top_bs;
+    BdrvChild *active_disk, *hidden_disk, *secondary_disk;
     int64_t active_length, hidden_length, disk_length;
     AioContext *aio_context;
     Error *local_err = NULL;
@@ -488,32 +492,32 @@  static void replication_start(ReplicationState *rs, ReplicationMode mode,
     case REPLICATION_MODE_PRIMARY:
         break;
     case REPLICATION_MODE_SECONDARY:
-        s->active_disk = bs->file;
-        if (!s->active_disk || !s->active_disk->bs ||
-                                    !s->active_disk->bs->backing) {
+        active_disk = bs->file;
+        if (!active_disk || !active_disk->bs ||
+                                    !active_disk->bs->backing) {
             error_setg(errp, "Active disk doesn't have backing file");
             aio_context_release(aio_context);
             return;
         }
 
-        s->hidden_disk = s->active_disk->bs->backing;
-        if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
+        hidden_disk = active_disk->bs->backing;
+        if (!hidden_disk->bs || !hidden_disk->bs->backing) {
             error_setg(errp, "Hidden disk doesn't have backing file");
             aio_context_release(aio_context);
             return;
         }
 
-        s->secondary_disk = s->hidden_disk->bs->backing;
-        if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
+        secondary_disk = hidden_disk->bs->backing;
+        if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
             error_setg(errp, "The secondary disk doesn't have block backend");
             aio_context_release(aio_context);
             return;
         }
 
         /* verify the length */
-        active_length = bdrv_getlength(s->active_disk->bs);
-        hidden_length = bdrv_getlength(s->hidden_disk->bs);
-        disk_length = bdrv_getlength(s->secondary_disk->bs);
+        active_length = bdrv_getlength(active_disk->bs);
+        hidden_length = bdrv_getlength(hidden_disk->bs);
+        disk_length = bdrv_getlength(secondary_disk->bs);
         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
             active_length != hidden_length || hidden_length != disk_length) {
             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
@@ -523,10 +527,10 @@  static void replication_start(ReplicationState *rs, ReplicationMode mode,
         }
 
         /* Must be true, or the bdrv_getlength() calls would have failed */
-        assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
+        assert(active_disk->bs->drv && hidden_disk->bs->drv);
 
-        if (!s->active_disk->bs->drv->bdrv_make_empty ||
-            !s->hidden_disk->bs->drv->bdrv_make_empty) {
+        if (!active_disk->bs->drv->bdrv_make_empty ||
+            !hidden_disk->bs->drv->bdrv_make_empty) {
             error_setg(errp,
                        "Active disk or hidden disk doesn't support make_empty");
             aio_context_release(aio_context);
@@ -541,6 +545,28 @@  static void replication_start(ReplicationState *rs, ReplicationMode mode,
             return;
         }
 
+        s->active_disk = active_disk;
+
+        bdrv_ref(hidden_disk->bs);
+        s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
+                                           &child_of_bds, BDRV_CHILD_DATA,
+                                           &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            aio_context_release(aio_context);
+            return;
+        }
+
+        bdrv_ref(secondary_disk->bs);
+        s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
+                                              "secondary disk", &child_of_bds,
+                                              BDRV_CHILD_DATA, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            aio_context_release(aio_context);
+            return;
+        }
+
         /* start backup job now */
         error_setg(&s->blocker,
                    "Block device is in use by internal backup job");
@@ -646,7 +672,9 @@  static void replication_done(void *opaque, int ret)
         s->stage = BLOCK_REPLICATION_DONE;
 
         s->active_disk = NULL;
+        bdrv_unref_child(bs, s->secondary_disk);
         s->secondary_disk = NULL;
+        bdrv_unref_child(bs, s->hidden_disk);
         s->hidden_disk = NULL;
         s->error = 0;
     } else {