diff mbox

[v2,2/2] block: Make op blockers recursive

Message ID 1411412452-8655-3-git-send-email-benoit.canet@nodalink.com
State New
Headers show

Commit Message

Benoît Canet Sept. 22, 2014, 7 p.m. UTC
Since the block layer code is starting to modify the BDS graph right in the
middle of BDS chains (block-mirror's replace parameter for example) QEMU needs
to properly block and unblock whole BDS subtrees; recursion is a neat way to
achieve this task.

An optional base arguments was added to the API to optionally allow blocking
only a part of a BDS chain.

This patch also takes care of modifying the op blockers users.

Signed-off-by: Benoit Canet <benoit.canet@nodalink.com>
---
 block-migration.c         |   4 +-
 block.c                   | 121 ++++++++++++++++++++++++++++++++++++++++++----
 block/blkverify.c         |  21 ++++++++
 block/commit.c            |   2 +
 block/mirror.c            |  26 +++++++---
 block/quorum.c            |  29 +++++++++++
 block/stream.c            |   2 +
 block/vmdk.c              |  32 ++++++++++++
 blockjob.c                |   6 +--
 include/block/block.h     |  12 +++--
 include/block/block_int.h |  10 ++++
 11 files changed, 240 insertions(+), 25 deletions(-)

Comments

Jeff Cody Oct. 1, 2014, 4:38 a.m. UTC | #1
On Mon, Sep 22, 2014 at 09:00:52PM +0200, Benoît Canet wrote:
> Since the block layer code is starting to modify the BDS graph right in the
> middle of BDS chains (block-mirror's replace parameter for example) QEMU needs
> to properly block and unblock whole BDS subtrees; recursion is a neat way to
> achieve this task.
> 
> An optional base arguments was added to the API to optionally allow blocking
> only a part of a BDS chain.
> 
> This patch also takes care of modifying the op blockers users.
> 
> Signed-off-by: Benoit Canet <benoit.canet@nodalink.com>
> ---
>  block-migration.c         |   4 +-
>  block.c                   | 121 ++++++++++++++++++++++++++++++++++++++++++----
>  block/blkverify.c         |  21 ++++++++
>  block/commit.c            |   2 +
>  block/mirror.c            |  26 +++++++---
>  block/quorum.c            |  29 +++++++++++
>  block/stream.c            |   2 +
>  block/vmdk.c              |  32 ++++++++++++
>  blockjob.c                |   6 +--
>  include/block/block.h     |  12 +++--
>  include/block/block_int.h |  10 ++++
>  11 files changed, 240 insertions(+), 25 deletions(-)
> 

Ideally, I think this patch should be split into 2 patches:

1) Add recursion to bdrv_op_unblock/block
2) Implement .bdrv_op_resursive_block/unblock in the format drivers

But that is just a suggestion, nothing else.  Other people's
preferences may vary.

> diff --git a/block-migration.c b/block-migration.c
> index 3ad31a2..ad7aa03 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -362,7 +362,7 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
>          bmds->shared_base = block_mig_state.shared_base;
>          alloc_aio_bitmap(bmds);
>          error_setg(&bmds->blocker, "block device is in use by migration");
> -        bdrv_op_block_all(bs, bmds->blocker);
> +        bdrv_op_block_all(bs, NULL, bmds->blocker);
>          bdrv_ref(bs);
>  
>          block_mig_state.total_sector_sum += sectors;
> @@ -600,7 +600,7 @@ static void blk_mig_cleanup(void)
>      blk_mig_lock();
>      while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
>          QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
> -        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
> +        bdrv_op_unblock_all(bmds->bs, NULL, bmds->blocker);
>          error_free(bmds->blocker);
>          bdrv_unref(bmds->bs);
>          g_free(bmds->aio_bitmap);
> diff --git a/block.c b/block.c
> index 9a9f8a0..b2d6953 100644
> --- a/block.c
> +++ b/block.c
> @@ -1148,7 +1148,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
>  
>      if (bs->backing_hd) {
>          assert(bs->backing_blocker);
> -        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
> +        bdrv_op_unblock_all(bs->backing_hd, NULL, bs->backing_blocker);
>      } else if (backing_hd) {
>          error_setg(&bs->backing_blocker,
>                     "device is used as backing hd of '%s'",
> @@ -1166,9 +1166,9 @@ void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
>      pstrcpy(bs->backing_format, sizeof(bs->backing_format),
>              backing_hd->drv ? backing_hd->drv->format_name : "");
>  
> -    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
> +    bdrv_op_block_all(bs->backing_hd, NULL, bs->backing_blocker);
>      /* Otherwise we won't be able to commit due to check in bdrv_commit */
> -    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
> +    bdrv_op_unblock(bs->backing_hd, NULL, BLOCK_OP_TYPE_COMMIT,
>                      bs->backing_blocker);
>  out:
>      bdrv_refresh_limits(bs, NULL);
> @@ -5482,7 +5482,9 @@ bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
>      return false;
>  }
>  
> -void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
> +/* do the real work of blocking a BDS */
> +static void bdrv_do_op_block(BlockDriverState *bs, BlockOpType op,
> +                             Error *reason)
>  {
>      BdrvOpBlocker *blocker;
>      assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
> @@ -5492,7 +5494,9 @@ void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
>      QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
>  }
>  
> -void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
> +/* do the real work of unblocking a BDS */
> +static void bdrv_do_op_unblock(BlockDriverState *bs, BlockOpType op,
> +                               Error *reason)
>  {
>      BdrvOpBlocker *blocker, *next;
>      assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
> @@ -5504,19 +5508,118 @@ void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
>      }
>  }
>  
> -void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
> +static bool bdrv_op_is_blocked_by(BlockDriverState *bs, BlockOpType op,
> +                                  Error *reason)
> +{
> +    BdrvOpBlocker *blocker;
> +    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
> +    QLIST_FOREACH(blocker, &bs->op_blockers[op], list) {
> +        if (blocker->reason == reason) {
> +            return true;
> +        }
> +    }
> +    return false;
> +}
> +
> +/* block recursively a BDS
> + *
> + * If base != NULL the caller must make sure that there is no multiple child
> + * BDS in the subtree pointed to by bs

In the case when base != NULL, should that really matter?  In a
driver's .bdrv_op_recursive_block/unblock, if that driver has private
children (multiple or not), shouldn't the private children be treated
as one black box, and blocked / unblocked just like the parent
BDS?

For instance, what if we had a tree such as this:

                       [quorum1] <---- [active]
                           |
                           | (private)
                           |
                           v
[node2]<-- [node1] <--- [image1]


if 'quorum1' was to be op blocked, and 'image1' and its children all
comprise 'quorum1', wouldn't we always want to lock 'image1' all the
way down to 'node2'?

Or do we expect that someone will intentionally pass a 'base' pointer
along that matches somewhere in the private child tree?

> + *
> + * @bs:     the BDS to start to recurse from
> + * @base:   the BDS where the recursion should end
> + *          could be NULL if the recursion should go down everywhere
> + * @op:     the type of op blocker to block
> + * @reason: the error message associated with the blocking
> + */
> +void bdrv_op_block(BlockDriverState *bs, BlockDriverState *base,
> +                   BlockOpType op, Error *reason)
> +{
> +    if (!bs) {
> +        return;
> +    }
> +
> +    /* did we recurse down to base ? */
> +    if (bs == base) {
> +        return;
> +    }
> +
> +    /* prevent recursion loop */
> +    if (bdrv_op_is_blocked_by(bs, op, reason)) {
> +        return;
> +    }

Should we handle this somehow (isn't this effectively an error, that
will prematurely end the blocking of this particular line)? 

If we hit this, we are going to end up in a scenario where we haven't
blocked the chain as requested, and we don't know the state of the
blocking below this failure point.  Seems like the caller should know,
and we should have a way of cleaning up.

Actually, now I am wondering if we perhaps shouldn't be building a
list to block, and then blocking everything atomically once we know
there are no failure points.

> +
> +    /* block first for recursion loop protection to work */
> +    bdrv_do_op_block(bs, op, reason);
> +
> +    bdrv_op_block(bs->file, base, op, reason);
> +
> +    if (bs->drv && bs->drv->supports_backing) {
> +        bdrv_op_block(bs->backing_hd, base, op, reason);
> +    }
> +
> +    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
> +        bs->drv->bdrv_op_recursive_block(bs, base, op, reason);

Do we need to allow .bdrv_op_recursive_block() to fail and return
error (and handle it, of course)?


> +    }
> +}
> +
> +/* unblock recursively a BDS
> + *
> + * If base != NULL the caller must make sure that there is no multiple child
> + * BDS in the subtree pointed to by bs
> + *
> + * @bs:     the BDS to start to recurse from
> + * @base:   the BDS where the recursion should end
> + *          could be NULL if the recursion should go down everywhere
> + * @op:     the type of op blocker to block

s/block/unblock


> + * @reason: the error message associated with the blocking
> + */
> +void bdrv_op_unblock(BlockDriverState *bs, BlockDriverState *base,
> +                     BlockOpType op, Error *reason)
> +{
> +    if (!bs) {
> +        return;
> +    }
> +
> +    /* did we recurse down to base ? */
> +    if (bs == base) {
> +        return;
> +    }
> +
> +    /* prevent recursion loop */
> +    if (!bdrv_op_is_blocked_by(bs, op, reason)) {
> +        return;
> +    }

Do we want to do this negative check here?  Even if a given node is
not blocked, its children may be, and I would assume (since this is
recursive) that if I called bdrv_op_unblock() all of the chain down to
'base' would be unblocked for the given reason.

E.g. 

   [image1] <-- [image2] <-- [image3]
   blocked      unblocked     blocked

I would assume that bdrv_op_unblock(image2, NULL, reason) would still
unblock image1, even if image2 was unblocked.

> +
> +    /* block first for recursion loop protection to work */
> +    bdrv_do_op_unblock(bs, op, reason);
> +
> +    bdrv_op_unblock(bs->file, base, op, reason);
> +
> +    if (bs->drv && bs->drv->supports_backing) {
> +        bdrv_op_unblock(bs->backing_hd, base, op, reason);
> +    }
> +
> +    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
> +        bs->drv->bdrv_op_recursive_unblock(bs, base, op, reason);
> +    }
> +}
> +
> +void bdrv_op_block_all(BlockDriverState *bs, BlockDriverState *base,
> +                       Error *reason)
>  {
>      int i;
>      for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
> -        bdrv_op_block(bs, i, reason);
> +        bdrv_op_block(bs, base, i, reason);
>      }
>  }
>  
> -void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
> +void bdrv_op_unblock_all(BlockDriverState *bs, BlockDriverState *base,
> +                         Error *reason)
>  {
>      int i;
>      for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
> -        bdrv_op_unblock(bs, i, reason);
> +        bdrv_op_unblock(bs, base, i, reason);
>      }
>  }
>  
> diff --git a/block/blkverify.c b/block/blkverify.c
> index 163064c..5a4fa7c 100644
> --- a/block/blkverify.c
> +++ b/block/blkverify.c
> @@ -349,6 +349,24 @@ static void blkverify_refresh_filename(BlockDriverState *bs)
>      }
>  }
>  
> +static void blkverify_op_recursive_block(BlockDriverState *bs,
> +                                         BlockDriverState *base,
> +                                         BlockOpType op,
> +                                         Error *reason)
> +{
> +    BDRVBlkverifyState *s = bs->opaque;
> +    bdrv_op_block(s->test_file, base, op, reason);
> +}
> +
> +static void blkverify_op_recursive_unblock(BlockDriverState *bs,
> +                                           BlockDriverState *base,
> +                                           BlockOpType op,
> +                                           Error *reason)
> +{
> +    BDRVBlkverifyState *s = bs->opaque;
> +    bdrv_op_unblock(s->test_file, base, op, reason);
> +}
> +
>  static BlockDriver bdrv_blkverify = {
>      .format_name                      = "blkverify",
>      .protocol_name                    = "blkverify",
> @@ -367,6 +385,9 @@ static BlockDriver bdrv_blkverify = {
>      .bdrv_attach_aio_context          = blkverify_attach_aio_context,
>      .bdrv_detach_aio_context          = blkverify_detach_aio_context,
>  
> +    .bdrv_op_recursive_block          = blkverify_op_recursive_block,
> +    .bdrv_op_recursive_unblock        = blkverify_op_recursive_unblock,
> +
>      .is_filter                        = true,
>      .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
>  };
> diff --git a/block/commit.c b/block/commit.c
> index 91517d3..6a514d4 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -142,6 +142,8 @@ wait:
>  
>      if (!block_job_is_cancelled(&s->common) && sector_num == end) {
>          /* success */
> +        /* unblock only BDS to be dropped */
> +        bdrv_op_unblock_all(top, base, s->common.blocker);
>          ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
>      }
>  
> diff --git a/block/mirror.c b/block/mirror.c
> index 18b18e0..6f6071a 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -322,6 +322,7 @@ static void coroutine_fn mirror_run(void *opaque)
>      uint64_t last_pause_ns;
>      BlockDriverInfo bdi;
>      char backing_filename[1024];
> +    BlockDriverState *to_replace;
>      int ret = 0;
>      int n;
>  
> @@ -510,14 +511,16 @@ immediate_exit:
>      g_free(s->in_flight_bitmap);
>      bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
>      bdrv_iostatus_disable(s->target);
> +    to_replace = s->common.bs;
> +    if (s->to_replace) {
> +        bdrv_op_unblock_all(s->to_replace, NULL, s->replace_blocker);
> +        to_replace = s->to_replace;
> +    }
>      if (s->should_complete && ret == 0) {
> -        BlockDriverState *to_replace = s->common.bs;
> -        if (s->to_replace) {
> -            to_replace = s->to_replace;
> -        }
>          if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
>              bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
>          }
> +        bdrv_op_unblock_all(to_replace, NULL, bs->job->blocker);
>          bdrv_swap(s->target, to_replace);
>          if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
>              /* drop the bs loop chain formed by the swap: break the loop then
> @@ -528,7 +531,6 @@ immediate_exit:
>          }
>      }
>      if (s->to_replace) {
> -        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
>          error_free(s->replace_blocker);
>          bdrv_unref(s->to_replace);
>      }
> @@ -581,7 +583,7 @@ static void mirror_complete(BlockJob *job, Error **errp)
>  
>          error_setg(&s->replace_blocker,
>                     "block device is in use by block-job-complete");
> -        bdrv_op_block_all(s->to_replace, s->replace_blocker);
> +        bdrv_op_block_all(s->to_replace, NULL, s->replace_blocker);
>          bdrv_ref(s->to_replace);
>      }
>  
> @@ -640,12 +642,22 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
>          return;
>      }
>  
> -
>      s = block_job_create(driver, bs, speed, cb, opaque, errp);
>      if (!s) {
>          return;
>      }
>  
> +    /* block_job_create block all block job operations.
> +     * If replaces is specified the block-job-complete code will have to block
> +     * BLOCK_OP_TYPE_MIRROR_REPLACE during the time the mirror complete.
> +     * Conditionally unblock BLOCK_OP_TYPE_MIRROR_REPLACE so the
> +     * block-job-complete can do its work.
> +     */
> +    if (replaces) {
> +        bdrv_op_unblock(bs, NULL, BLOCK_OP_TYPE_MIRROR_REPLACE,
> +                        bs->job->blocker);
> +    }
> +
>      s->replaces = g_strdup(replaces);
>      s->on_source_error = on_source_error;
>      s->on_target_error = on_target_error;
> diff --git a/block/quorum.c b/block/quorum.c
> index 093382e..7b06911 100644
> --- a/block/quorum.c
> +++ b/block/quorum.c
> @@ -1065,6 +1065,32 @@ static void quorum_refresh_filename(BlockDriverState *bs)
>      bs->full_open_options = opts;
>  }
>  
> +static void quorum_op_recursive_block(BlockDriverState *bs,
> +                                      BlockDriverState *base,
> +                                      BlockOpType op,
> +                                      Error *reason)
> +{
> +    BDRVQuorumState *s = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < s->num_children; i++) {
> +        bdrv_op_block(s->bs[i], base, op, reason);
> +    }

I am wondering if the 'base' argument above should always be NULL in
the case of private children.  I.e., the state of private children
trees in their entirety should always reflect the state of the
multi-children BDS parent.

> +}
> +
> +static void quorum_op_recursive_unblock(BlockDriverState *bs,
> +                                        BlockDriverState *base,
> +                                        BlockOpType op,
> +                                        Error *reason)
> +{
> +    BDRVQuorumState *s = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < s->num_children; i++) {
> +        bdrv_op_unblock(s->bs[i], base, op, reason);
> +    }
> +}
> +
>  static BlockDriver bdrv_quorum = {
>      .format_name                        = "quorum",
>      .protocol_name                      = "quorum",
> @@ -1086,6 +1112,9 @@ static BlockDriver bdrv_quorum = {
>      .bdrv_detach_aio_context            = quorum_detach_aio_context,
>      .bdrv_attach_aio_context            = quorum_attach_aio_context,
>  
> +    .bdrv_op_recursive_block            = quorum_op_recursive_block,
> +    .bdrv_op_recursive_unblock          = quorum_op_recursive_unblock,
> +
>      .is_filter                          = true,
>      .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
>  };
> diff --git a/block/stream.c b/block/stream.c
> index cdea3e8..a9e69a5 100644
> --- a/block/stream.c
> +++ b/block/stream.c
> @@ -192,6 +192,8 @@ wait:
>              }
>          }
>          ret = bdrv_change_backing_file(bs, base_id, base_fmt);
> +        /* unblock only BDS to be dropped */
> +        bdrv_op_unblock_all(bs->backing_hd, base, s->common.blocker);
>          close_unused_images(bs, base, base_id);
>      }
>  
> diff --git a/block/vmdk.c b/block/vmdk.c
> index afdea1a..257f683 100644
> --- a/block/vmdk.c
> +++ b/block/vmdk.c
> @@ -2221,6 +2221,36 @@ static QemuOptsList vmdk_create_opts = {
>      }
>  };
>  
> +static void vmdk_op_recursive_block(BlockDriverState *bs,
> +                                    BlockDriverState *base,
> +                                    BlockOpType op,
> +                                    Error *reason)
> +{
> +    BDRVVmdkState *s = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < s->num_extents; i++) {
> +        if (s->extents[i].file) {
> +            bdrv_op_block(s->extents[i].file, base, op, reason);
> +        }
> +    }
> +}
> +
> +static void vmdk_op_recursive_unblock(BlockDriverState *bs,
> +                                      BlockDriverState *base,
> +                                      BlockOpType op,
> +                                      Error *reason)
> +{
> +    BDRVVmdkState *s = bs->opaque;
> +    int i;
> +
> +    for (i = 0; i < s->num_extents; i++) {
> +        if (s->extents[i].file) {
> +            bdrv_op_unblock(s->extents[i].file, base, op, reason);
> +        }
> +    }
> +}
> +
>  static BlockDriver bdrv_vmdk = {
>      .format_name                  = "vmdk",
>      .instance_size                = sizeof(BDRVVmdkState),
> @@ -2243,6 +2273,8 @@ static BlockDriver bdrv_vmdk = {
>      .bdrv_get_info                = vmdk_get_info,
>      .bdrv_detach_aio_context      = vmdk_detach_aio_context,
>      .bdrv_attach_aio_context      = vmdk_attach_aio_context,
> +    .bdrv_op_recursive_block      = vmdk_op_recursive_block,
> +    .bdrv_op_recursive_unblock    = vmdk_op_recursive_unblock,
>  
>      .supports_backing             = true,
>      .create_opts                  = &vmdk_create_opts,
> diff --git a/blockjob.c b/blockjob.c
> index 0689fdd..b7a98f6 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -49,7 +49,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
>      job = g_malloc0(driver->instance_size);
>      error_setg(&job->blocker, "block device is in use by block job: %s",
>                 BlockJobType_lookup[driver->job_type]);
> -    bdrv_op_block_all(bs, job->blocker);
> +    bdrv_op_block_all(bs, NULL, job->blocker);
>  
>      job->driver        = driver;
>      job->bs            = bs;
> @@ -65,7 +65,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
>          block_job_set_speed(job, speed, &local_err);
>          if (local_err) {
>              bs->job = NULL;
> -            bdrv_op_unblock_all(bs, job->blocker);
> +            bdrv_op_unblock_all(bs, NULL, job->blocker);
>              error_free(job->blocker);
>              g_free(job);
>              error_propagate(errp, local_err);
> @@ -82,7 +82,7 @@ void block_job_completed(BlockJob *job, int ret)
>      assert(bs->job == job);
>      job->cb(job->opaque, ret);
>      bs->job = NULL;
> -    bdrv_op_unblock_all(bs, job->blocker);
> +    bdrv_op_unblock_all(bs, NULL, job->blocker);
>      error_free(job->blocker);
>      g_free(job);
>  }
> diff --git a/include/block/block.h b/include/block/block.h
> index 10952ed..6d21a0b 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -480,10 +480,14 @@ void bdrv_ref(BlockDriverState *bs);
>  void bdrv_unref(BlockDriverState *bs);
>  
>  bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
> -void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
> -void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason);
> -void bdrv_op_block_all(BlockDriverState *bs, Error *reason);
> -void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason);
> +void bdrv_op_block(BlockDriverState *bs, BlockDriverState *base,
> +                   BlockOpType op, Error *reason);
> +void bdrv_op_unblock(BlockDriverState *bs, BlockDriverState *base,
> +                     BlockOpType op, Error *reason);
> +void bdrv_op_block_all(BlockDriverState *bs, BlockDriverState *base,
> +                       Error *reason);
> +void bdrv_op_unblock_all(BlockDriverState *bs, BlockDriverState *base,
> +                         Error *reason);
>  bool bdrv_op_blocker_is_empty(BlockDriverState *bs);
>  
>  typedef enum {
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 8d86a6c..fc93da6 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -270,6 +270,16 @@ struct BlockDriver {
>      void (*bdrv_io_unplug)(BlockDriverState *bs);
>      void (*bdrv_flush_io_queue)(BlockDriverState *bs);
>  
> +    /* helps blockers to propagate recursively */
> +    void (*bdrv_op_recursive_block)(BlockDriverState *bs,
> +                                    BlockDriverState *base,
> +                                    BlockOpType op,
> +                                    Error *reason);
> +    void (*bdrv_op_recursive_unblock)(BlockDriverState *bs,
> +                                      BlockDriverState *base,
> +                                      BlockOpType op,
> +                                      Error *reason);
> +

Seems like these new functions would be better named '.bdrv_op_block'
and '.bdrv_op_unblock'?  That way, recursive or not, it is clear block
drivers can implement whatever blocking is appropriate for themselves.

>      QLIST_ENTRY(BlockDriver) list;
>  };
>  
> -- 
> 2.1.0
>
Benoît Canet Oct. 1, 2014, 4:59 a.m. UTC | #2
> 
> Seems like these new functions would be better named '.bdrv_op_block'
> and '.bdrv_op_unblock'?  That way, recursive or not, it is clear block
> drivers can implement whatever blocking is appropriate for themselves.
> 
> >      QLIST_ENTRY(BlockDriver) list;
> >  };

Hi,

Thanks a lot for review.
I will mull this and respond in the begining of the afternoon.

Best regards

Benoît
Benoît Canet Oct. 1, 2014, 9:29 a.m. UTC | #3
Thanks a lot for reviewing this patch.

Since the code is not trivial I will give my arguments for writing it
this way.


> > +/* block recursively a BDS
> > + *
> > + * If base != NULL the caller must make sure that there is no multiple child
> > + * BDS in the subtree pointed to by bs
> 
> In the case when base != NULL, should that really matter?  In a
> driver's .bdrv_op_recursive_block/unblock, if that driver has private
> children (multiple or not), shouldn't the private children be treated
> as one black box, and blocked / unblocked just like the parent
> BDS?

This is a stale comment. My bad.

> 
> For instance, what if we had a tree such as this:
> 
>                        [quorum1] <---- [active]
>                            |
>                            | (private)
>                            |
>                            v
> [node2]<-- [node1] <--- [imag> 
> if 'quorum1' was to be op blocked, and 'image1' and its children all
> comprise 'quorum1', wouldn't we always want to lock 'image1' all the
> way down to 'node2'?

That's what the patch does.

> 
> Or do we expect that someone will intentionally pass a 'base' pointer
> along that matches somewhere in the private child tree?

This is not expected by the caller but I wrote the patch so it will also work.

> 
> > + *
> > + * @bs:     the BDS to start to recurse from
> > + * @base:   the BDS where the recursion should end
> > + *          could be NULL if the recursion should go down everywhere
> > + * @op:     the type of op blocker to block
> > + * @reason: the error message associated with the blocking
> > + */
> > +void bdrv_op_block(BlockDriverState *bs, BlockDriverState *base,
> > +                   BlockOpType op, Error *reason)
> > +{
> > +    if (!bs) {
> > +        return;
> > +    }
> > +
> > +    /* did we recurse down to base ? */
> > +    if (bs == base) {
> > +        return;
> > +    }
> > +
> > +    /* prevent recursion loop */
> > +    if (bdrv_op_is_blocked_by(bs, op, reason)) {
> > +        return;
> > +    }
> 
> Should we handle this somehow (isn't this effectively an error, that
> will prematurely end the blocking of this particular line)? 

The main purpose of this is mirror.c and commit.c would form BDS loops on completion.
These callers could break the look manually but the code would fail
if a loop is not breaked and the blocker function are called on it.
So the blocker code have to handle recursion loops.

The bdrv_op_is_blocked_by match a reason being the criteria to test the blocker.

So this test will trigger only if a BDS is already blocked by the same reason
pointer.

This make the bdrv_op_block function idempotent.

note that it is the only sane way I found to make the blockers function handle
loops.

> 
> If we hit this, we are going to end up in a scenario where we haven't
> blocked the chain as requested, and we don't know the state of the
> blocking below this failure point.  Seems like the caller should know,
> and we should have a way of cleaning up.

If we hit this the chain was already blocked with the same reason pointer.

> 
> Actually, now I am wondering if we perhaps shouldn't be building a
> list to block, and then blocking everything atomically once we know
> there are no failure points.
>

I don't think this particular test is a failure point.

> > +
> > +    /* block first for recursion loop protection to work */
> > +    bdrv_do_op_block(bs, op, reason);
> > +
> > +    bdrv_op_block(bs->file, base, op, reason);
> > +
> > +    if (bs->drv && bs->drv->supports_backing) {
> > +        bdrv_op_block(bs->backing_hd, base, op, reason);
> > +    }
> > +
> > +    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
> > +        bs->drv->bdrv_op_recursive_block(bs, base, op, reason);
> 
> Do we need to allow .bdrv_op_recursive_block() to fail and return
> error (and handle it, of course)?

I don't know yet: but lets let this question float a little more in the mail thread.

> 
> 
> 
> s/block/unblock
Thanks

> 
> 
> > + * @reason: the error message associated with the blocking
> > + */
> > +void bdrv_op_unblock(BlockDriverState *bs, BlockDriverState *base,
> > +                     BlockOpType op, Error *reason)
> > +{
> > +    if (!bs) {
> > +        return;
> > +    }
> > +
> > +    /* did we recurse down to base ? */
> > +    if (bs == base) {
> > +        return;
> > +    }
> > +
> > +    /* prevent recursion loop */
> > +    if (!bdrv_op_is_blocked_by(bs, op, reason)) {
> > +        return;
> > +    }
> 
> Do we want to do this negative check here?  Even if a given node is
> not blocked, its children may be, and I would assume (since this is
> recursive) that if I called bdrv_op_unblock() all of the chain down to
> 'base' would be unblocked for the given reason.
> 
> E.g. 
> 
>    [image1] <-- [image2] <-- [image3]
>    blocked      unblocked     blocked

To reach this state the caller code would have to do the following twisted sequence.

block(image3, with_reason1)
unblock(image2, with_reason1)
block(image1, with_reason1)

There is no such sequence in the code thanks to the base argument and we could
enforce that no such sequence ever get written.

> 
> I would assume that bdrv_op_unblock(image2, NULL, reason) would still
> unblock image1, even if image2 was unblocked.

Should we also assume that bdrv_op_unblock(image4, NULL, reason) with image4 being
image3 parent unblock everything underneath ?

> > +static void quorum_op_recursive_block(BlockDriverState *bs,
> > +                                      BlockDriverState *base,
> > +                                      BlockOpType op,
> > +                                      Error *reason)
> > +{
> > +    BDRVQuorumState *s = bs->opaque;
> > +    int i;
> > +
> > +    for (i = 0; i < s->num_children; i++) {
> > +        bdrv_op_block(s->bs[i], base, op, reason);
> > +    }
> 
> I am wondering if the 'base' argument above should always be NULL in
> the case of private children.  I.e., the state of private children
> trees in their entirety should always reflect the state of the
> multi-children BDS parent.

Yes we could simply discard it.

> > +    /* helps blockers to propagate recursively */
> > +    void (*bdrv_op_recursive_block)(BlockDriverState *bs,
> > +                                    BlockDriverState *base,
> > +                                    BlockOpType op,
> > +                                    Error *reason);
> > +    void (*bdrv_op_recursive_unblock)(BlockDriverState *bs,
> > +                                      BlockDriverState *base,
> > +                                      BlockOpType op,
> > +                                      Error *reason);
> > +
> 
> Seems like these new functions would be better named '.bdrv_op_block'
> and '.bdrv_op_unblock'?  That way, recursive or not, it is clear block
> drivers can implement whatever blocking is appropriate for themselves.

I agree.

Best regards

Benoît
Jeff Cody Oct. 1, 2014, 3:19 p.m. UTC | #4
On Wed, Oct 01, 2014 at 09:29:44AM +0000, Benoît Canet wrote:
> 
> Thanks a lot for reviewing this patch.
> 
> Since the code is not trivial I will give my arguments for writing it
> this way.
>

Thanks, that is very helpful.

> 
> > > +/* block recursively a BDS
> > > + *
> > > + * If base != NULL the caller must make sure that there is no multiple child
> > > + * BDS in the subtree pointed to by bs
> > 
> > In the case when base != NULL, should that really matter?  In a
> > driver's .bdrv_op_recursive_block/unblock, if that driver has private
> > children (multiple or not), shouldn't the private children be treated
> > as one black box, and blocked / unblocked just like the parent
> > BDS?
> 
> This is a stale comment. My bad.
> 
> > 
> > For instance, what if we had a tree such as this:
> > 
> >                        [quorum1] <---- [active]
> >                            |
> >                            | (private)
> >                            |
> >                            v
> > [node2]<-- [node1] <--- [imag> 
> > if 'quorum1' was to be op blocked, and 'image1' and its children all
> > comprise 'quorum1', wouldn't we always want to lock 'image1' all the
> > way down to 'node2'?
> 
> That's what the patch does.
> 

OK, great - my comment above was written in response to the stale comment :)

> > 
> > Or do we expect that someone will intentionally pass a 'base' pointer
> > along that matches somewhere in the private child tree?
> 
> This is not expected by the caller but I wrote the patch so it will also work.
> 
> > 
> > > + *
> > > + * @bs:     the BDS to start to recurse from
> > > + * @base:   the BDS where the recursion should end
> > > + *          could be NULL if the recursion should go down everywhere
> > > + * @op:     the type of op blocker to block
> > > + * @reason: the error message associated with the blocking
> > > + */
> > > +void bdrv_op_block(BlockDriverState *bs, BlockDriverState *base,
> > > +                   BlockOpType op, Error *reason)
> > > +{
> > > +    if (!bs) {
> > > +        return;
> > > +    }
> > > +
> > > +    /* did we recurse down to base ? */
> > > +    if (bs == base) {
> > > +        return;
> > > +    }
> > > +
> > > +    /* prevent recursion loop */
> > > +    if (bdrv_op_is_blocked_by(bs, op, reason)) {
> > > +        return;
> > > +    }
> > 
> > Should we handle this somehow (isn't this effectively an error, that
> > will prematurely end the blocking of this particular line)? 
> 
> The main purpose of this is mirror.c and commit.c would form BDS loops on completion.
> These callers could break the look manually but the code would fail
> if a loop is not breaked and the blocker function are called on it.
> So the blocker code have to handle recursion loops.

I think commit/mirror/etc should break any loops prior to calling
recursive functions on those loops (just like it should do before
calling bdrv_unref(), etc..).  Otherwise, I think the recursive
functions make assumptions that may be true in certain contexts, but
not necessarily all.

(Hmm, I believe that Fam had a series that did some nice cleanup on
bdrv_drop_intermediate() and related areas that did some loop
breaking, IIRC).

> 
> The bdrv_op_is_blocked_by match a reason being the criteria to test the blocker.
> 
> So this test will trigger only if a BDS is already blocked by the same reason
> pointer.
> 
> This make the bdrv_op_block function idempotent.
> 
> note that it is the only sane way I found to make the blockers function handle
> loops.
> 
> > 
> > If we hit this, we are going to end up in a scenario where we haven't
> > blocked the chain as requested, and we don't know the state of the
> > blocking below this failure point.  Seems like the caller should know,
> > and we should have a way of cleaning up.
> 
> If we hit this the chain was already blocked with the same reason pointer.
> 
> > 
> > Actually, now I am wondering if we perhaps shouldn't be building a
> > list to block, and then blocking everything atomically once we know
> > there are no failure points.
> >
> 
> I don't think this particular test is a failure point.
>

With the way it is written, the individual BDS is blocked with the
same reason pointer, but not necessarily the whole chain - unless you
make the assumption that blockers will only be used via the recursive
interface, and never individually on a node.

The caller doesn't have an interface to check that the chain is not
blocked - bdrv_op_is_blocked_by() doesn't operate recursively.  

If we tried to block a chain that has some portion already blocked for
the same reason, shouldn't that be an error?


> > > +
> > > +    /* block first for recursion loop protection to work */
> > > +    bdrv_do_op_block(bs, op, reason);
> > > +
> > > +    bdrv_op_block(bs->file, base, op, reason);
> > > +
> > > +    if (bs->drv && bs->drv->supports_backing) {
> > > +        bdrv_op_block(bs->backing_hd, base, op, reason);
> > > +    }
> > > +
> > > +    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
> > > +        bs->drv->bdrv_op_recursive_block(bs, base, op, reason);
> > 
> > Do we need to allow .bdrv_op_recursive_block() to fail and return
> > error (and handle it, of course)?
> 
> I don't know yet: but lets let this question float a little more in the mail thread.
> 
> > 
> > 
> > 
> > s/block/unblock
> Thanks
> 
> > 
> > 
> > > + * @reason: the error message associated with the blocking
> > > + */
> > > +void bdrv_op_unblock(BlockDriverState *bs, BlockDriverState *base,
> > > +                     BlockOpType op, Error *reason)
> > > +{
> > > +    if (!bs) {
> > > +        return;
> > > +    }
> > > +
> > > +    /* did we recurse down to base ? */
> > > +    if (bs == base) {
> > > +        return;
> > > +    }
> > > +
> > > +    /* prevent recursion loop */
> > > +    if (!bdrv_op_is_blocked_by(bs, op, reason)) {
> > > +        return;
> > > +    }
> > 
> > Do we want to do this negative check here?  Even if a given node is
> > not blocked, its children may be, and I would assume (since this is
> > recursive) that if I called bdrv_op_unblock() all of the chain down to
> > 'base' would be unblocked for the given reason.
> > 
> > E.g. 
> > 
> >    [image1] <-- [image2] <-- [image3]
> >    blocked      unblocked     blocked
> 
> To reach this state the caller code would have to do the following twisted sequence.
> 
> block(image3, with_reason1)
> unblock(image2, with_reason1)
> block(image1, with_reason1)
>
> There is no such sequence in the code thanks to the base argument and we could
> enforce that no such sequence ever get written.
>

If we ignore blockdev-add and scenarios where an image node may have
multiple overlays, we might be able to assume that such a sequence
could not exist.

But in that case, should this negative check result in an error?

It would seem at this point we would have encountered one of these
scenarios:

1.) An invalid block/unblock state in the chain, if we assume that no
such sequence should exist

2.) We tried to unblock more than we originally blocked

> > 
> > I would assume that bdrv_op_unblock(image2, NULL, reason) would still
> > unblock image1, even if image2 was unblocked.
> 
> Should we also assume that bdrv_op_unblock(image4, NULL, reason) with image4 being
> image3 parent unblock everything underneath ?
> 

I think we either do that, or return an error.  But to have
bdrv_op_unblock() (or bdrv_op_block()) silently stop at some point in
the chain prior to reaching 'base' doesn't seem correct to me.

> > > +static void quorum_op_recursive_block(BlockDriverState *bs,
> > > +                                      BlockDriverState *base,
> > > +                                      BlockOpType op,
> > > +                                      Error *reason)
> > > +{
> > > +    BDRVQuorumState *s = bs->opaque;
> > > +    int i;
> > > +
> > > +    for (i = 0; i < s->num_children; i++) {
> > > +        bdrv_op_block(s->bs[i], base, op, reason);
> > > +    }
> > 
> > I am wondering if the 'base' argument above should always be NULL in
> > the case of private children.  I.e., the state of private children
> > trees in their entirety should always reflect the state of the
> > multi-children BDS parent.
> 
> Yes we could simply discard it.
> 
> > > +    /* helps blockers to propagate recursively */
> > > +    void (*bdrv_op_recursive_block)(BlockDriverState *bs,
> > > +                                    BlockDriverState *base,
> > > +                                    BlockOpType op,
> > > +                                    Error *reason);
> > > +    void (*bdrv_op_recursive_unblock)(BlockDriverState *bs,
> > > +                                      BlockDriverState *base,
> > > +                                      BlockOpType op,
> > > +                                      Error *reason);
> > > +
> > 
> > Seems like these new functions would be better named '.bdrv_op_block'
> > and '.bdrv_op_unblock'?  That way, recursive or not, it is clear block
> > drivers can implement whatever blocking is appropriate for themselves.
> 
> I agree.
> 
> Best regards
> 
> Benoît
Benoît Canet Oct. 1, 2014, 3:42 p.m. UTC | #5
> > 
> > The main purpose of this is mirror.c and commit.c would form BDS loops on completion.
> > These callers could break the look manually but the code would fail
> > if a loop is not breaked and the blocker function are called on it.
> > So the blocker code have to handle recursion loops.
> 
> I think commit/mirror/etc should break any loops prior to calling
> recursive functions on those loops (just like it should do before
> calling bdrv_unref(), etc..).  Otherwise, I think the recursive
> functions make assumptions that may be true in certain contexts, but
> not necessarily all.
> 
> (Hmm, I believe that Fam had a series that did some nice cleanup on
> bdrv_drop_intermediate() and related areas that did some loop
> breaking, IIRC).

Ok I could use that as a basis.

> > 
> > I don't think this particular test is a failure point.
> >
> 
> With the way it is written, the individual BDS is blocked with the
> same reason pointer, but not necessarily the whole chain - unless you
> make the assumption that blockers will only be used via the recursive
> interface, and never individually on a node.

there is no more a no recursive version with this patch so this assumption
will be respected.

> 
> The caller doesn't have an interface to check that the chain is not
> blocked - bdrv_op_is_blocked_by() doesn't operate recursively.  
> 
> If we tried to block a chain that has some portion already blocked for
> the same reason, shouldn't that be an error?

Why should we allow this ?
My understanding is that blocking something should be associated to a
single operation whatever they are.
So one operation to block implying one different reason is not so strange.

> > > > +
> > > > +    /* block first for recursion loop protection to work */
> > > > +    bdrv_do_op_block(bs, op, reason);
> > > > +
> > > > +    bdrv_op_block(bs->file, base, op, reason);
> > > > +
> > > > +    if (bs->drv && bs->drv->supports_backing) {
> > > > +        bdrv_op_block(bs->backing_hd, base, op, reason);
> > > > +    }
> > > > +
> > > > +    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
> > > > +        bs->drv->bdrv_op_recursive_block(bs, base, op, reason);
> > > 
> > > Do we need to allow .bdrv_op_recursive_block() to fail and return
> > > error (and handle it, of course)?
> > 
> > I don't know yet: but lets let this question float a little more in the mail thread.
> > 
> > > 
> > > 
> > > 


> > 
> > To reach this state the caller code would have to do the following twisted sequence.
> > 
> > block(image3, with_reason1)
> > unblock(image2, with_reason1)
> > block(image1, with_reason1)
> >
> > There is no such sequence in the code thanks to the base argument and we could
> > enforce that no such sequence ever get written.
> >
> 
> If we ignore blockdev-add and scenarios where an image node may have
> multiple overlays, we might be able to assume that such a sequence
> could not exist.
> 
> But in that case, should this negative check result in an error?
> 
> It would seem at this point we would have encountered one of these
> scenarios:
> 
> 1.) An invalid block/unblock state in the chain, if we assume that no
> such sequence should exist
> 
> 2.) We tried to unblock more than we originally blocked

> 
> > > 
> > > I would assume that bdrv_op_unblock(image2, NULL, reason) would still
> > > unblock image1, even if image2 was unblocked.
> > 
> > Should we also assume that bdrv_op_unblock(image4, NULL, reason) with image4 being
> > image3 parent unblock everything underneath ?
> > 
> 
> I think we either do that, or return an error.  But to have
> bdrv_op_unblock() (or bdrv_op_block()) silently stop at some point in
> the chain prior to reaching 'base' doesn't seem correct to me.
> 

Maybe you are right.
I don't mind rewriting the patchset with error handling and without the recursion
loop avoidance code given I find Fam's loop breaking patches on the list.

I remember trying to write loop breaking by myself just before 2.1 and it was
annoying.

Best regards

Benoît
diff mbox

Patch

diff --git a/block-migration.c b/block-migration.c
index 3ad31a2..ad7aa03 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -362,7 +362,7 @@  static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
         bmds->shared_base = block_mig_state.shared_base;
         alloc_aio_bitmap(bmds);
         error_setg(&bmds->blocker, "block device is in use by migration");
-        bdrv_op_block_all(bs, bmds->blocker);
+        bdrv_op_block_all(bs, NULL, bmds->blocker);
         bdrv_ref(bs);
 
         block_mig_state.total_sector_sum += sectors;
@@ -600,7 +600,7 @@  static void blk_mig_cleanup(void)
     blk_mig_lock();
     while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
-        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
+        bdrv_op_unblock_all(bmds->bs, NULL, bmds->blocker);
         error_free(bmds->blocker);
         bdrv_unref(bmds->bs);
         g_free(bmds->aio_bitmap);
diff --git a/block.c b/block.c
index 9a9f8a0..b2d6953 100644
--- a/block.c
+++ b/block.c
@@ -1148,7 +1148,7 @@  void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
 
     if (bs->backing_hd) {
         assert(bs->backing_blocker);
-        bdrv_op_unblock_all(bs->backing_hd, bs->backing_blocker);
+        bdrv_op_unblock_all(bs->backing_hd, NULL, bs->backing_blocker);
     } else if (backing_hd) {
         error_setg(&bs->backing_blocker,
                    "device is used as backing hd of '%s'",
@@ -1166,9 +1166,9 @@  void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd)
     pstrcpy(bs->backing_format, sizeof(bs->backing_format),
             backing_hd->drv ? backing_hd->drv->format_name : "");
 
-    bdrv_op_block_all(bs->backing_hd, bs->backing_blocker);
+    bdrv_op_block_all(bs->backing_hd, NULL, bs->backing_blocker);
     /* Otherwise we won't be able to commit due to check in bdrv_commit */
-    bdrv_op_unblock(bs->backing_hd, BLOCK_OP_TYPE_COMMIT,
+    bdrv_op_unblock(bs->backing_hd, NULL, BLOCK_OP_TYPE_COMMIT,
                     bs->backing_blocker);
 out:
     bdrv_refresh_limits(bs, NULL);
@@ -5482,7 +5482,9 @@  bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp)
     return false;
 }
 
-void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
+/* do the real work of blocking a BDS */
+static void bdrv_do_op_block(BlockDriverState *bs, BlockOpType op,
+                             Error *reason)
 {
     BdrvOpBlocker *blocker;
     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
@@ -5492,7 +5494,9 @@  void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason)
     QLIST_INSERT_HEAD(&bs->op_blockers[op], blocker, list);
 }
 
-void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
+/* do the real work of unblocking a BDS */
+static void bdrv_do_op_unblock(BlockDriverState *bs, BlockOpType op,
+                               Error *reason)
 {
     BdrvOpBlocker *blocker, *next;
     assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
@@ -5504,19 +5508,118 @@  void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason)
     }
 }
 
-void bdrv_op_block_all(BlockDriverState *bs, Error *reason)
+static bool bdrv_op_is_blocked_by(BlockDriverState *bs, BlockOpType op,
+                                  Error *reason)
+{
+    BdrvOpBlocker *blocker;
+    assert((int) op >= 0 && op < BLOCK_OP_TYPE_MAX);
+    QLIST_FOREACH(blocker, &bs->op_blockers[op], list) {
+        if (blocker->reason == reason) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* block recursively a BDS
+ *
+ * If base != NULL the caller must make sure that there is no multiple child
+ * BDS in the subtree pointed to by bs
+ *
+ * @bs:     the BDS to start to recurse from
+ * @base:   the BDS where the recursion should end
+ *          could be NULL if the recursion should go down everywhere
+ * @op:     the type of op blocker to block
+ * @reason: the error message associated with the blocking
+ */
+void bdrv_op_block(BlockDriverState *bs, BlockDriverState *base,
+                   BlockOpType op, Error *reason)
+{
+    if (!bs) {
+        return;
+    }
+
+    /* did we recurse down to base ? */
+    if (bs == base) {
+        return;
+    }
+
+    /* prevent recursion loop */
+    if (bdrv_op_is_blocked_by(bs, op, reason)) {
+        return;
+    }
+
+    /* block first for recursion loop protection to work */
+    bdrv_do_op_block(bs, op, reason);
+
+    bdrv_op_block(bs->file, base, op, reason);
+
+    if (bs->drv && bs->drv->supports_backing) {
+        bdrv_op_block(bs->backing_hd, base, op, reason);
+    }
+
+    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
+        bs->drv->bdrv_op_recursive_block(bs, base, op, reason);
+    }
+}
+
+/* unblock recursively a BDS
+ *
+ * If base != NULL the caller must make sure that there is no multiple child
+ * BDS in the subtree pointed to by bs
+ *
+ * @bs:     the BDS to start to recurse from
+ * @base:   the BDS where the recursion should end
+ *          could be NULL if the recursion should go down everywhere
+ * @op:     the type of op blocker to block
+ * @reason: the error message associated with the blocking
+ */
+void bdrv_op_unblock(BlockDriverState *bs, BlockDriverState *base,
+                     BlockOpType op, Error *reason)
+{
+    if (!bs) {
+        return;
+    }
+
+    /* did we recurse down to base ? */
+    if (bs == base) {
+        return;
+    }
+
+    /* prevent recursion loop */
+    if (!bdrv_op_is_blocked_by(bs, op, reason)) {
+        return;
+    }
+
+    /* block first for recursion loop protection to work */
+    bdrv_do_op_unblock(bs, op, reason);
+
+    bdrv_op_unblock(bs->file, base, op, reason);
+
+    if (bs->drv && bs->drv->supports_backing) {
+        bdrv_op_unblock(bs->backing_hd, base, op, reason);
+    }
+
+    if (bs->drv && bs->drv->bdrv_op_recursive_block) {
+        bs->drv->bdrv_op_recursive_unblock(bs, base, op, reason);
+    }
+}
+
+void bdrv_op_block_all(BlockDriverState *bs, BlockDriverState *base,
+                       Error *reason)
 {
     int i;
     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
-        bdrv_op_block(bs, i, reason);
+        bdrv_op_block(bs, base, i, reason);
     }
 }
 
-void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason)
+void bdrv_op_unblock_all(BlockDriverState *bs, BlockDriverState *base,
+                         Error *reason)
 {
     int i;
     for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
-        bdrv_op_unblock(bs, i, reason);
+        bdrv_op_unblock(bs, base, i, reason);
     }
 }
 
diff --git a/block/blkverify.c b/block/blkverify.c
index 163064c..5a4fa7c 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -349,6 +349,24 @@  static void blkverify_refresh_filename(BlockDriverState *bs)
     }
 }
 
+static void blkverify_op_recursive_block(BlockDriverState *bs,
+                                         BlockDriverState *base,
+                                         BlockOpType op,
+                                         Error *reason)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    bdrv_op_block(s->test_file, base, op, reason);
+}
+
+static void blkverify_op_recursive_unblock(BlockDriverState *bs,
+                                           BlockDriverState *base,
+                                           BlockOpType op,
+                                           Error *reason)
+{
+    BDRVBlkverifyState *s = bs->opaque;
+    bdrv_op_unblock(s->test_file, base, op, reason);
+}
+
 static BlockDriver bdrv_blkverify = {
     .format_name                      = "blkverify",
     .protocol_name                    = "blkverify",
@@ -367,6 +385,9 @@  static BlockDriver bdrv_blkverify = {
     .bdrv_attach_aio_context          = blkverify_attach_aio_context,
     .bdrv_detach_aio_context          = blkverify_detach_aio_context,
 
+    .bdrv_op_recursive_block          = blkverify_op_recursive_block,
+    .bdrv_op_recursive_unblock        = blkverify_op_recursive_unblock,
+
     .is_filter                        = true,
     .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter,
 };
diff --git a/block/commit.c b/block/commit.c
index 91517d3..6a514d4 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -142,6 +142,8 @@  wait:
 
     if (!block_job_is_cancelled(&s->common) && sector_num == end) {
         /* success */
+        /* unblock only BDS to be dropped */
+        bdrv_op_unblock_all(top, base, s->common.blocker);
         ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
     }
 
diff --git a/block/mirror.c b/block/mirror.c
index 18b18e0..6f6071a 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -322,6 +322,7 @@  static void coroutine_fn mirror_run(void *opaque)
     uint64_t last_pause_ns;
     BlockDriverInfo bdi;
     char backing_filename[1024];
+    BlockDriverState *to_replace;
     int ret = 0;
     int n;
 
@@ -510,14 +511,16 @@  immediate_exit:
     g_free(s->in_flight_bitmap);
     bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
     bdrv_iostatus_disable(s->target);
+    to_replace = s->common.bs;
+    if (s->to_replace) {
+        bdrv_op_unblock_all(s->to_replace, NULL, s->replace_blocker);
+        to_replace = s->to_replace;
+    }
     if (s->should_complete && ret == 0) {
-        BlockDriverState *to_replace = s->common.bs;
-        if (s->to_replace) {
-            to_replace = s->to_replace;
-        }
         if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) {
             bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL);
         }
+        bdrv_op_unblock_all(to_replace, NULL, bs->job->blocker);
         bdrv_swap(s->target, to_replace);
         if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) {
             /* drop the bs loop chain formed by the swap: break the loop then
@@ -528,7 +531,6 @@  immediate_exit:
         }
     }
     if (s->to_replace) {
-        bdrv_op_unblock_all(s->to_replace, s->replace_blocker);
         error_free(s->replace_blocker);
         bdrv_unref(s->to_replace);
     }
@@ -581,7 +583,7 @@  static void mirror_complete(BlockJob *job, Error **errp)
 
         error_setg(&s->replace_blocker,
                    "block device is in use by block-job-complete");
-        bdrv_op_block_all(s->to_replace, s->replace_blocker);
+        bdrv_op_block_all(s->to_replace, NULL, s->replace_blocker);
         bdrv_ref(s->to_replace);
     }
 
@@ -640,12 +642,22 @@  static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
         return;
     }
 
-
     s = block_job_create(driver, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
     }
 
+    /* block_job_create block all block job operations.
+     * If replaces is specified the block-job-complete code will have to block
+     * BLOCK_OP_TYPE_MIRROR_REPLACE during the time the mirror complete.
+     * Conditionally unblock BLOCK_OP_TYPE_MIRROR_REPLACE so the
+     * block-job-complete can do its work.
+     */
+    if (replaces) {
+        bdrv_op_unblock(bs, NULL, BLOCK_OP_TYPE_MIRROR_REPLACE,
+                        bs->job->blocker);
+    }
+
     s->replaces = g_strdup(replaces);
     s->on_source_error = on_source_error;
     s->on_target_error = on_target_error;
diff --git a/block/quorum.c b/block/quorum.c
index 093382e..7b06911 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -1065,6 +1065,32 @@  static void quorum_refresh_filename(BlockDriverState *bs)
     bs->full_open_options = opts;
 }
 
+static void quorum_op_recursive_block(BlockDriverState *bs,
+                                      BlockDriverState *base,
+                                      BlockOpType op,
+                                      Error *reason)
+{
+    BDRVQuorumState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_children; i++) {
+        bdrv_op_block(s->bs[i], base, op, reason);
+    }
+}
+
+static void quorum_op_recursive_unblock(BlockDriverState *bs,
+                                        BlockDriverState *base,
+                                        BlockOpType op,
+                                        Error *reason)
+{
+    BDRVQuorumState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_children; i++) {
+        bdrv_op_unblock(s->bs[i], base, op, reason);
+    }
+}
+
 static BlockDriver bdrv_quorum = {
     .format_name                        = "quorum",
     .protocol_name                      = "quorum",
@@ -1086,6 +1112,9 @@  static BlockDriver bdrv_quorum = {
     .bdrv_detach_aio_context            = quorum_detach_aio_context,
     .bdrv_attach_aio_context            = quorum_attach_aio_context,
 
+    .bdrv_op_recursive_block            = quorum_op_recursive_block,
+    .bdrv_op_recursive_unblock          = quorum_op_recursive_unblock,
+
     .is_filter                          = true,
     .bdrv_recurse_is_first_non_filter   = quorum_recurse_is_first_non_filter,
 };
diff --git a/block/stream.c b/block/stream.c
index cdea3e8..a9e69a5 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -192,6 +192,8 @@  wait:
             }
         }
         ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+        /* unblock only BDS to be dropped */
+        bdrv_op_unblock_all(bs->backing_hd, base, s->common.blocker);
         close_unused_images(bs, base, base_id);
     }
 
diff --git a/block/vmdk.c b/block/vmdk.c
index afdea1a..257f683 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2221,6 +2221,36 @@  static QemuOptsList vmdk_create_opts = {
     }
 };
 
+static void vmdk_op_recursive_block(BlockDriverState *bs,
+                                    BlockDriverState *base,
+                                    BlockOpType op,
+                                    Error *reason)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_extents; i++) {
+        if (s->extents[i].file) {
+            bdrv_op_block(s->extents[i].file, base, op, reason);
+        }
+    }
+}
+
+static void vmdk_op_recursive_unblock(BlockDriverState *bs,
+                                      BlockDriverState *base,
+                                      BlockOpType op,
+                                      Error *reason)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < s->num_extents; i++) {
+        if (s->extents[i].file) {
+            bdrv_op_unblock(s->extents[i].file, base, op, reason);
+        }
+    }
+}
+
 static BlockDriver bdrv_vmdk = {
     .format_name                  = "vmdk",
     .instance_size                = sizeof(BDRVVmdkState),
@@ -2243,6 +2273,8 @@  static BlockDriver bdrv_vmdk = {
     .bdrv_get_info                = vmdk_get_info,
     .bdrv_detach_aio_context      = vmdk_detach_aio_context,
     .bdrv_attach_aio_context      = vmdk_attach_aio_context,
+    .bdrv_op_recursive_block      = vmdk_op_recursive_block,
+    .bdrv_op_recursive_unblock    = vmdk_op_recursive_unblock,
 
     .supports_backing             = true,
     .create_opts                  = &vmdk_create_opts,
diff --git a/blockjob.c b/blockjob.c
index 0689fdd..b7a98f6 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -49,7 +49,7 @@  void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
     job = g_malloc0(driver->instance_size);
     error_setg(&job->blocker, "block device is in use by block job: %s",
                BlockJobType_lookup[driver->job_type]);
-    bdrv_op_block_all(bs, job->blocker);
+    bdrv_op_block_all(bs, NULL, job->blocker);
 
     job->driver        = driver;
     job->bs            = bs;
@@ -65,7 +65,7 @@  void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
         block_job_set_speed(job, speed, &local_err);
         if (local_err) {
             bs->job = NULL;
-            bdrv_op_unblock_all(bs, job->blocker);
+            bdrv_op_unblock_all(bs, NULL, job->blocker);
             error_free(job->blocker);
             g_free(job);
             error_propagate(errp, local_err);
@@ -82,7 +82,7 @@  void block_job_completed(BlockJob *job, int ret)
     assert(bs->job == job);
     job->cb(job->opaque, ret);
     bs->job = NULL;
-    bdrv_op_unblock_all(bs, job->blocker);
+    bdrv_op_unblock_all(bs, NULL, job->blocker);
     error_free(job->blocker);
     g_free(job);
 }
diff --git a/include/block/block.h b/include/block/block.h
index 10952ed..6d21a0b 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -480,10 +480,14 @@  void bdrv_ref(BlockDriverState *bs);
 void bdrv_unref(BlockDriverState *bs);
 
 bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
-void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
-void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason);
-void bdrv_op_block_all(BlockDriverState *bs, Error *reason);
-void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason);
+void bdrv_op_block(BlockDriverState *bs, BlockDriverState *base,
+                   BlockOpType op, Error *reason);
+void bdrv_op_unblock(BlockDriverState *bs, BlockDriverState *base,
+                     BlockOpType op, Error *reason);
+void bdrv_op_block_all(BlockDriverState *bs, BlockDriverState *base,
+                       Error *reason);
+void bdrv_op_unblock_all(BlockDriverState *bs, BlockDriverState *base,
+                         Error *reason);
 bool bdrv_op_blocker_is_empty(BlockDriverState *bs);
 
 typedef enum {
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 8d86a6c..fc93da6 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -270,6 +270,16 @@  struct BlockDriver {
     void (*bdrv_io_unplug)(BlockDriverState *bs);
     void (*bdrv_flush_io_queue)(BlockDriverState *bs);
 
+    /* helps blockers to propagate recursively */
+    void (*bdrv_op_recursive_block)(BlockDriverState *bs,
+                                    BlockDriverState *base,
+                                    BlockOpType op,
+                                    Error *reason);
+    void (*bdrv_op_recursive_unblock)(BlockDriverState *bs,
+                                      BlockDriverState *base,
+                                      BlockOpType op,
+                                      Error *reason);
+
     QLIST_ENTRY(BlockDriver) list;
 };