diff mbox series

[02/18] block: BDS deletion during bdrv_drain_recurse

Message ID 20170913181910.29688-3-mreitz@redhat.com
State New
Headers show
Series block/mirror: Add active-sync mirroring | expand

Commit Message

Max Reitz Sept. 13, 2017, 6:18 p.m. UTC
Draining a BDS child may lead to the original BDS and/or its other
children being deleted (e.g. if the original BDS represents a block
job).  We should prepare for this in both bdrv_drain_recurse() and
bdrv_drained_begin() by monitoring whether the BDS we are about to drain
still exists at all.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 52 insertions(+), 20 deletions(-)

Comments

Fam Zheng Sept. 18, 2017, 3:44 a.m. UTC | #1
On Wed, 09/13 20:18, Max Reitz wrote:
> Drainined a BDS child may lead to both the original BDS and/or its other
> children being deleted (e.g. if the original BDS represents a block
> job).  We should prepare for this in both bdrv_drain_recurse() and
> bdrv_drained_begin() by monitoring whether the BDS we are about to drain
> still exists at all.

Can the deletion happen when IOThread calls
bdrv_drain_recurse/bdrv_drained_begin?  If not, is it enough to do

    ...
    if (in_main_loop) {
        bdrv_ref(bs);
    }
    ...
    if (in_main_loop) {
        bdrv_unref(bs);
    }

to protect the main loop case? So the BdrvDeletedStatus state is not needed.

Fam

> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/io.c | 72 +++++++++++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 52 insertions(+), 20 deletions(-)
> 
> diff --git a/block/io.c b/block/io.c
> index 4378ae4c7d..8ec1a564ad 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -182,33 +182,57 @@ static void bdrv_drain_invoke(BlockDriverState *bs)
>  
>  static bool bdrv_drain_recurse(BlockDriverState *bs)
>  {
> -    BdrvChild *child, *tmp;
> +    BdrvChild *child;
>      bool waited;
> +    struct BDSToDrain {
> +        BlockDriverState *bs;
> +        BdrvDeletedStatus del_stat;
> +        QLIST_ENTRY(BDSToDrain) next;
> +    };
> +    QLIST_HEAD(, BDSToDrain) bs_list = QLIST_HEAD_INITIALIZER(bs_list);
> +    bool in_main_loop =
> +        qemu_get_current_aio_context() == qemu_get_aio_context();
>  
>      waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
>  
>      /* Ensure any pending metadata writes are submitted to bs->file.  */
>      bdrv_drain_invoke(bs);
>  
> -    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
> -        BlockDriverState *bs = child->bs;
> -        bool in_main_loop =
> -            qemu_get_current_aio_context() == qemu_get_aio_context();
> -        assert(bs->refcnt > 0);
> -        if (in_main_loop) {
> -            /* In case the recursive bdrv_drain_recurse processes a
> -             * block_job_defer_to_main_loop BH and modifies the graph,
> -             * let's hold a reference to bs until we are done.
> -             *
> -             * IOThread doesn't have such a BH, and it is not safe to call
> -             * bdrv_unref without BQL, so skip doing it there.
> -             */
> -            bdrv_ref(bs);
> -        }
> -        waited |= bdrv_drain_recurse(bs);
> -        if (in_main_loop) {
> -            bdrv_unref(bs);
> +    /* Draining children may result in other children being removed and maybe
> +     * even deleted, so copy the children list first */
> +    QLIST_FOREACH(child, &bs->children, next) {
> +        struct BDSToDrain *bs2d = g_new0(struct BDSToDrain, 1);
> +
> +        bs2d->bs = child->bs;
> +        QLIST_INSERT_HEAD(&bs->deleted_status, &bs2d->del_stat, next);
> +
> +        QLIST_INSERT_HEAD(&bs_list, bs2d, next);
> +    }
> +
> +    while (!QLIST_EMPTY(&bs_list)) {
> +        struct BDSToDrain *bs2d = QLIST_FIRST(&bs_list);
> +        QLIST_REMOVE(bs2d, next);
> +
> +        if (!bs2d->del_stat.deleted) {
> +            QLIST_REMOVE(&bs2d->del_stat, next);
> +
> +            if (in_main_loop) {
> +                /* In case the recursive bdrv_drain_recurse processes a
> +                 * block_job_defer_to_main_loop BH and modifies the graph,
> +                 * let's hold a reference to the BDS until we are done.
> +                 *
> +                 * IOThread doesn't have such a BH, and it is not safe to call
> +                 * bdrv_unref without BQL, so skip doing it there.
> +                 */
> +                bdrv_ref(bs2d->bs);
> +            }
> +            waited |= bdrv_drain_recurse(bs2d->bs);
> +            if (in_main_loop) {
> +                bdrv_unref(bs2d->bs);
> +            }
>          }
> +
> +        g_free(bs2d);
>      }
>  
>      return waited;
> @@ -252,17 +276,25 @@ static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
>  
>  void bdrv_drained_begin(BlockDriverState *bs)
>  {
> +    BdrvDeletedStatus del_stat = { .deleted = false };
> +
>      if (qemu_in_coroutine()) {
>          bdrv_co_yield_to_drain(bs);
>          return;
>      }
>  
> +    QLIST_INSERT_HEAD(&bs->deleted_status, &del_stat, next);
> +
>      if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
>          aio_disable_external(bdrv_get_aio_context(bs));
>          bdrv_parent_drained_begin(bs);
>      }
>  
> -    bdrv_drain_recurse(bs);
> +    if (!del_stat.deleted) {
> +        QLIST_REMOVE(&del_stat, next);
> +
> +        bdrv_drain_recurse(bs);
> +    }
>  }
>  
>  void bdrv_drained_end(BlockDriverState *bs)
> -- 
> 2.13.5
>
Max Reitz Sept. 18, 2017, 4:13 p.m. UTC | #2
On 2017-09-18 05:44, Fam Zheng wrote:
> On Wed, 09/13 20:18, Max Reitz wrote:
>> Drainined a BDS child may lead to both the original BDS and/or its other
>> children being deleted (e.g. if the original BDS represents a block
>> job).  We should prepare for this in both bdrv_drain_recurse() and
>> bdrv_drained_begin() by monitoring whether the BDS we are about to drain
>> still exists at all.
> 
> Can the deletion happen when IOThread calls
> bdrv_drain_recurse/bdrv_drained_begin?

I don't think so, because (1) my issue was draining a block job and that
can only be completed in the main loop, and (2) I would like to think
it's always impossible, considering that bdrv_unref() may only be called
with the BQL.

>                                         If not, is it enough to do
> 
>     ...
>     if (in_main_loop) {
>         bdrv_ref(bs);
>     }
>     ...
>     if (in_main_loop) {
>         bdrv_unref(bs);
>     }
> 
> to protect the main loop case? So the BdrvDeletedStatus state is not needed.

We already have that in bdrv_drain_recurse(), don't we?

The issue here is, though, that QLIST_FOREACH_SAFE() stores the next
child pointer to @tmp.  However, once the current child @child is
drained, @tmp may no longer be valid -- it may have been detached from
@bs, and it may even have been deleted.

We could work around the deletion by somehow taking a reference to the
next child (but BdrvChild doesn't really have a refcount, and in order to
do so, we would probably have to emulate being a parent or
something...), but then you'd still have the issue of @tmp being
detached from the children list we're trying to iterate over.  So
tmp->next is no longer valid.

Anyway, the detachment issue is the reason why I decided to introduce the
bs_list.

But maybe that actually saves us from having to fiddle with BdrvChild...
Since it's just a list of BDSs now, it may be enough to simply
bdrv_ref() all of the BDSs in that list before draining any of them.  So
we'd keep creating the bs_list and then we'd move the existing
bdrv_ref() from the drain loop into the loop filling bs_list.

And adding a bdrv_ref()/bdrv_unref() pair to bdrv_drained_begin() should
hopefully work there, too.

Max
Max Reitz Oct. 9, 2017, 6:30 p.m. UTC | #3
On 2017-09-18 18:13, Max Reitz wrote:
> On 2017-09-18 05:44, Fam Zheng wrote:
>> On Wed, 09/13 20:18, Max Reitz wrote:
>>> Drainined a BDS child may lead to both the original BDS and/or its other
>>> children being deleted (e.g. if the original BDS represents a block
>>> job).  We should prepare for this in both bdrv_drain_recurse() and
>>> bdrv_drained_begin() by monitoring whether the BDS we are about to drain
>>> still exists at all.
>>
>> Can the deletion happen when IOThread calls
>> bdrv_drain_recurse/bdrv_drained_begin?
> 
> I don't think so, because (1) my issue was draining a block job and that
> can only be completed in the main loop, and (2) I would like to think
> it's always impossible, considering that bdrv_unref() may only be called
> with the BQL.
> 
>>                                         If not, is it enough to do
>>
>>     ...
>>     if (in_main_loop) {
>>         bdrv_ref(bs);
>>     }
>>     ...
>>     if (in_main_loop) {
>>         bdrv_unref(bs);
>>     }
>>
>> to protect the main loop case? So the BdrvDeletedStatus state is not needed.
> 
> We already have that in bdrv_drained_recurse(), don't we?
> 
> The issue here is, though, that QLIST_FOREACH_SAFE() stores the next
> child pointer to @tmp.  However, once the current child @child is
> drained, @tmp may no longer be valid -- it may have been detached from
> @bs, and it may even have been deleted.
> 
> We could work around the latter by increasing the next child's reference
> somehow (but BdrvChild doesn't really have a refcount, and in order to
> do so, we would probably have to emulate being a parent or
> something...), but then you'd still have the issue of @tmp being
> detached from the children list we're trying to iterate over.  So
> tmp->next is no longer valid.
> 
> Anyway, so the latter is the reason why I decided to introduce the bs_list.
> 
> But maybe that actually saves us from having to fiddle with BdrvChild...
>  Since it's just a list of BDSs now, it may be enough to simply
> bdrv_ref() all of the BDSs in that list before draining any of them.  So
>  we'd keep creating the bs_list and then we'd move the existing
> bdrv_ref() from the drain loop into the loop filling bs_list.
> 
> And adding a bdrv_ref()/bdrv_unref() pair to bdrv_drained_begin() should
> hopefully work there, too.

It turns out it isn't so simple after all... because bdrv_close()
invokes bdrv_drained_begin(). So we may end up with an endless recursion
here.

One way to fix this would be to skip the bdrv_drained_begin() in
bdrv_close() if this would result in such a recursion...  But any
solution that comes quickly to my mind would require another BDS field,
too -- just checking the quiesce_counter is probably not enough because
this might just indicate concurrent drainage that stops before
bdrv_close() wants it to stop.

So maybe BdrvDeletedStatus is the simplest solution after all...?

Max
Kevin Wolf Oct. 10, 2017, 8:36 a.m. UTC | #4
Am 13.09.2017 um 20:18 hat Max Reitz geschrieben:
> Drainined a BDS child may lead to both the original BDS and/or its other
> children being deleted (e.g. if the original BDS represents a block
> job).  We should prepare for this in both bdrv_drain_recurse() and
> bdrv_drained_begin() by monitoring whether the BDS we are about to drain
> still exists at all.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>

How hard would it be to write a test case for this? qemu-iotests
probably isn't the right tool, but I feel a C unit test would be
possible.

> -    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
> -        BlockDriverState *bs = child->bs;
> -        bool in_main_loop =
> -            qemu_get_current_aio_context() == qemu_get_aio_context();
> -        assert(bs->refcnt > 0);

Would it make sense to keep this assertion for the !deleted case?

> -        if (in_main_loop) {
> -            /* In case the recursive bdrv_drain_recurse processes a
> -             * block_job_defer_to_main_loop BH and modifies the graph,
> -             * let's hold a reference to bs until we are done.
> -             *
> -             * IOThread doesn't have such a BH, and it is not safe to call
> -             * bdrv_unref without BQL, so skip doing it there.
> -             */
> -            bdrv_ref(bs);
> -        }
> -        waited |= bdrv_drain_recurse(bs);
> -        if (in_main_loop) {
> -            bdrv_unref(bs);
> +    /* Draining children may result in other children being removed and maybe
> +     * even deleted, so copy the children list first */

Maybe it's just me, but I failed to understand this correctly at first.
How about "being removed from their parent" to clarify that it's not the
BDS that is removed, but just the reference?

Kevin
Max Reitz Oct. 11, 2017, 11:41 a.m. UTC | #5
On 2017-10-10 10:36, Kevin Wolf wrote:
> Am 13.09.2017 um 20:18 hat Max Reitz geschrieben:
>> Drainined a BDS child may lead to both the original BDS and/or its other
>> children being deleted (e.g. if the original BDS represents a block
>> job).  We should prepare for this in both bdrv_drain_recurse() and
>> bdrv_drained_begin() by monitoring whether the BDS we are about to drain
>> still exists at all.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
> 
> How hard would it be to write a test case for this? qemu-iotests
> probably isn't the right tool, but I feel a C unit test would be
> possible.

I can look into it, but I can't promise anything.

>> -    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
>> -        BlockDriverState *bs = child->bs;
>> -        bool in_main_loop =
>> -            qemu_get_current_aio_context() == qemu_get_aio_context();
>> -        assert(bs->refcnt > 0);
> 
> Would it make sense to keep this assertion for the !deleted case?

Sure, why not.

>> -        if (in_main_loop) {
>> -            /* In case the recursive bdrv_drain_recurse processes a
>> -             * block_job_defer_to_main_loop BH and modifies the graph,
>> -             * let's hold a reference to bs until we are done.
>> -             *
>> -             * IOThread doesn't have such a BH, and it is not safe to call
>> -             * bdrv_unref without BQL, so skip doing it there.
>> -             */
>> -            bdrv_ref(bs);
>> -        }
>> -        waited |= bdrv_drain_recurse(bs);
>> -        if (in_main_loop) {
>> -            bdrv_unref(bs);
>> +    /* Draining children may result in other children being removed and maybe
>> +     * even deleted, so copy the children list first */
> 
> Maybe it's just me, but I failed to understand this correctly at first.
> How about "being removed from their parent" to clarify that it's not the
> BDS that is removed, but just the reference?

Well, it's the BdrvChild that's removed, that's what I meant by
"children".  But then the comment speaks of "children list" and means
creation of a list of BDSs, sooo...  Yes, some change necessary.

Max
diff mbox series

Patch

diff --git a/block/io.c b/block/io.c
index 4378ae4c7d..8ec1a564ad 100644
--- a/block/io.c
+++ b/block/io.c
@@ -182,33 +182,57 @@  static void bdrv_drain_invoke(BlockDriverState *bs)
 
 static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
-    BdrvChild *child, *tmp;
+    BdrvChild *child;
     bool waited;
+    struct BDSToDrain {
+        BlockDriverState *bs;
+        BdrvDeletedStatus del_stat;
+        QLIST_ENTRY(BDSToDrain) next;
+    };
+    QLIST_HEAD(, BDSToDrain) bs_list = QLIST_HEAD_INITIALIZER(bs_list);
+    bool in_main_loop =
+        qemu_get_current_aio_context() == qemu_get_aio_context();
 
     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 
     /* Ensure any pending metadata writes are submitted to bs->file.  */
     bdrv_drain_invoke(bs);
 
-    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
-        BlockDriverState *bs = child->bs;
-        bool in_main_loop =
-            qemu_get_current_aio_context() == qemu_get_aio_context();
-        assert(bs->refcnt > 0);
-        if (in_main_loop) {
-            /* In case the recursive bdrv_drain_recurse processes a
-             * block_job_defer_to_main_loop BH and modifies the graph,
-             * let's hold a reference to bs until we are done.
-             *
-             * IOThread doesn't have such a BH, and it is not safe to call
-             * bdrv_unref without BQL, so skip doing it there.
-             */
-            bdrv_ref(bs);
-        }
-        waited |= bdrv_drain_recurse(bs);
-        if (in_main_loop) {
-            bdrv_unref(bs);
+    /* Draining children may result in other children being removed and maybe
+     * even deleted, so copy the children list first */
+    QLIST_FOREACH(child, &bs->children, next) {
+        struct BDSToDrain *bs2d = g_new0(struct BDSToDrain, 1);
+
+        bs2d->bs = child->bs;
+        QLIST_INSERT_HEAD(&bs->deleted_status, &bs2d->del_stat, next);
+
+        QLIST_INSERT_HEAD(&bs_list, bs2d, next);
+    }
+
+    while (!QLIST_EMPTY(&bs_list)) {
+        struct BDSToDrain *bs2d = QLIST_FIRST(&bs_list);
+        QLIST_REMOVE(bs2d, next);
+
+        if (!bs2d->del_stat.deleted) {
+            QLIST_REMOVE(&bs2d->del_stat, next);
+
+            if (in_main_loop) {
+                /* In case the recursive bdrv_drain_recurse processes a
+                 * block_job_defer_to_main_loop BH and modifies the graph,
+                 * let's hold a reference to the BDS until we are done.
+                 *
+                 * IOThread doesn't have such a BH, and it is not safe to call
+                 * bdrv_unref without BQL, so skip doing it there.
+                 */
+                bdrv_ref(bs2d->bs);
+            }
+            waited |= bdrv_drain_recurse(bs2d->bs);
+            if (in_main_loop) {
+                bdrv_unref(bs2d->bs);
+            }
         }
+
+        g_free(bs2d);
     }
 
     return waited;
@@ -252,17 +276,25 @@  static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
+    BdrvDeletedStatus del_stat = { .deleted = false };
+
     if (qemu_in_coroutine()) {
         bdrv_co_yield_to_drain(bs);
         return;
     }
 
+    QLIST_INSERT_HEAD(&bs->deleted_status, &del_stat, next);
+
     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
         aio_disable_external(bdrv_get_aio_context(bs));
         bdrv_parent_drained_begin(bs);
     }
 
-    bdrv_drain_recurse(bs);
+    if (!del_stat.deleted) {
+        QLIST_REMOVE(&del_stat, next);
+
+        bdrv_drain_recurse(bs);
+    }
 }
 
 void bdrv_drained_end(BlockDriverState *bs)