[for-3.1,1/2] block: Don't inactivate children before parents

Message ID 20181126112828.23047-2-kwolf@redhat.com
State New
Series block: Fix crash on migration with explicit child nodes

Commit Message

Kevin Wolf Nov. 26, 2018, 11:28 a.m. UTC
bdrv_child_cb_inactivate() asserts that parents are already inactive
when children get inactivated. This precondition is necessary because
parents could still issue requests in their inactivation code.

When block nodes are created individually with -blockdev, all of them
are monitor owned and will be returned by bdrv_next() in an undefined
order (in practice, in the order of their creation, which is usually
children before parents), which obviously fails the assertion.

This patch fixes the ordering by skipping nodes with still active
parents in bdrv_inactivate_recurse() because we know that they will be
covered by recursion when the last active parent becomes inactive.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
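
For reference, the assertion the commit message refers to looks roughly
like this in block.c (a paraphrased sketch; for BDS parents,
child->opaque points at the parent node):

    /* Paraphrased sketch of bdrv_child_cb_inactivate(): by the time a
     * child is inactivated, its BDS parent must already be inactive,
     * because the parent may still issue requests while inactivating. */
    static void bdrv_child_cb_inactivate(BdrvChild *child)
    {
        BlockDriverState *bs = child->opaque;   /* the parent node */
        assert(bs->open_flags & BDRV_O_INACTIVE);
    }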

Comments

Max Reitz Nov. 26, 2018, 12:05 p.m. UTC | #1
On 26.11.18 12:28, Kevin Wolf wrote:
> bdrv_child_cb_inactivate() asserts that parents are already inactive
> when children get inactivated. This precondition is necessary because
> parents could still issue requests in their inactivation code.
> 
> When block nodes are created individually with -blockdev, all of them
> are monitor owned and will be returned by bdrv_next() in an undefined
> order (in practice, in the order of their creation, which is usually
> children before parents), which obviously fails the assertion.
> 
> This patch fixes the ordering by skipping nodes with still active
> parents in bdrv_inactivate_recurse() because we know that they will be
> covered by recursion when the last active parent becomes inactive.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  block.c | 22 ++++++++++++++++++++++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/block.c b/block.c
> index 5ba3435f8f..0569275e31 100644
> --- a/block.c
> +++ b/block.c
> @@ -4612,6 +4612,22 @@ void bdrv_invalidate_cache_all(Error **errp)
>      }
>  }
>  
> +static bool bdrv_has_active_bds_parent(BlockDriverState *bs)
> +{
> +    BdrvChild *parent;
> +
> +    QLIST_FOREACH(parent, &bs->parents, next_parent) {
> +        if (parent->role->parent_is_bds) {
> +            BlockDriverState *parent_bs = parent->opaque;
> +            if (!(parent_bs->open_flags & BDRV_O_INACTIVE)) {
> +                return true;
> +            }
> +        }
> +    }

Now I see why you say this might make sense as a permission.

> +
> +    return false;
> +}
> +
>  static int bdrv_inactivate_recurse(BlockDriverState *bs,
>                                     bool setting_flag)
>  {
> @@ -4622,6 +4638,12 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
>          return -ENOMEDIUM;
>      }
>  
> +    /* Make sure that we don't inactivate a child before its parent.
> +     * It will be covered by recursion from the yet active parent. */
> +    if (bdrv_has_active_bds_parent(bs)) {
> +        return 0;
> +    }
> +

Hm.  Wouldn't it make more sense to always return early when there are
any BDS parents?  Because if there are any BDS parents and none of them
are active (anymore), then this child will have been inactivated
already, and we can save ourselves the trouble of going through the rest
of the function again.

Do drivers support multiple calls to .bdrv_inactivate() at all?

Max

>      if (!setting_flag && bs->drv->bdrv_inactivate) {
>          ret = bs->drv->bdrv_inactivate(bs);
>          if (ret < 0) {
>
Kevin Wolf Nov. 26, 2018, 12:33 p.m. UTC | #2
Am 26.11.2018 um 13:05 hat Max Reitz geschrieben:
> On 26.11.18 12:28, Kevin Wolf wrote:
> > bdrv_child_cb_inactivate() asserts that parents are already inactive
> > when children get inactivated. This precondition is necessary because
> > parents could still issue requests in their inactivation code.
> > 
> > When block nodes are created individually with -blockdev, all of them
> > are monitor owned and will be returned by bdrv_next() in an undefined
> > order (in practice, in the order of their creation, which is usually
> > children before parents), which obviously fails the assertion.
> > 
> > This patch fixes the ordering by skipping nodes with still active
> > parents in bdrv_inactivate_recurse() because we know that they will be
> > covered by recursion when the last active parent becomes inactive.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >  block.c | 22 ++++++++++++++++++++++
> >  1 file changed, 22 insertions(+)
> > 
> > diff --git a/block.c b/block.c
> > index 5ba3435f8f..0569275e31 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -4612,6 +4612,22 @@ void bdrv_invalidate_cache_all(Error **errp)
> >      }
> >  }
> >  
> > +static bool bdrv_has_active_bds_parent(BlockDriverState *bs)
> > +{
> > +    BdrvChild *parent;
> > +
> > +    QLIST_FOREACH(parent, &bs->parents, next_parent) {
> > +        if (parent->role->parent_is_bds) {
> > +            BlockDriverState *parent_bs = parent->opaque;
> > +            if (!(parent_bs->open_flags & BDRV_O_INACTIVE)) {
> > +                return true;
> > +            }
> > +        }
> > +    }
> 
> Now I see why you say this might make sense as a permission.

You do? To be honest, now that I found this solution, I don't think a
permission makes much sense any more, because you would have the same
loop, and you would only be checking a different flag in the end.

> > +
> > +    return false;
> > +}
> > +
> >  static int bdrv_inactivate_recurse(BlockDriverState *bs,
> >                                     bool setting_flag)
> >  {
> > @@ -4622,6 +4638,12 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
> >          return -ENOMEDIUM;
> >      }
> >  
> > +    /* Make sure that we don't inactivate a child before its parent.
> > +     * It will be covered by recursion from the yet active parent. */
> > +    if (bdrv_has_active_bds_parent(bs)) {
> > +        return 0;
> > +    }
> > +
> 
> Hm.  Wouldn't it make more sense to always return early when there are
> any BDS parents?  Because if there are any BDS parents and none of them
> are active (anymore), then this child will have been inactivated
> already, and we can save ourselves the trouble of going through the rest
> of the function again.

I don't quite follow... If we always return early no matter whether
there is an active parent, who will have inactivated the child?

After trying to write up why you're wrong, I think there are two cases
and both of us have a point:

1. bdrv_next() enumerates the child node first and then the last BDS
   parent. This is what this patch fixes.

   It will inactivate the child exactly once, at the time that the last
   parent has become inactive (and recursively calls this function for
   each of its children). If you remove that one inactivation, too,
   children won't be inactivated at all.

2. bdrv_next() enumerates the last BDS parent first and then the child.
   This is unlikely and might even be impossible today, but once we
   expose bdrv_reopen() in QMP and let the user reconfigure the edges,
   it probably becomes possible.

   In this case, even after my patch we inactivate drivers twice. Maybe
   we should just return early if BDRV_O_INACTIVE is already set. What
   makes me kind of unsure is that we already test for this flag, but
   only for part of the function.

   Any ideas how to test this? Can we create such a situation today?
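
For illustration, the early return described in case 2 might look like
this at the top of bdrv_inactivate_recurse() (a sketch only; how it
interacts with the two-pass setting_flag logic is exactly the open
question):

    /* Sketch: if another parent already inactivated this node,
     * skip it instead of inactivating it a second time. */
    if (bs->open_flags & BDRV_O_INACTIVE) {
        return 0;
    }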

> Do drivers support multiple calls to .bdrv_inactivate() at all?

They were probably not designed for that... Not sure if there was ever a
commit where we used to call them multiple times without failing the
assertion first.

Kevin
Max Reitz Nov. 26, 2018, 12:40 p.m. UTC | #3
On 26.11.18 13:05, Max Reitz wrote:
> On 26.11.18 12:28, Kevin Wolf wrote:
>> bdrv_child_cb_inactivate() asserts that parents are already inactive
>> when children get inactivated. This precondition is necessary because
>> parents could still issue requests in their inactivation code.
>>
>> When block nodes are created individually with -blockdev, all of them
>> are monitor owned and will be returned by bdrv_next() in an undefined
>> order (in practice, in the order of their creation, which is usually
>> children before parents), which obviously fails the assertion.
>>
>> This patch fixes the ordering by skipping nodes with still active
>> parents in bdrv_inactivate_recurse() because we know that they will be
>> covered by recursion when the last active parent becomes inactive.
>>
>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
>> ---
>>  block.c | 22 ++++++++++++++++++++++
>>  1 file changed, 22 insertions(+)
>>
>> diff --git a/block.c b/block.c
>> index 5ba3435f8f..0569275e31 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -4612,6 +4612,22 @@ void bdrv_invalidate_cache_all(Error **errp)
>>      }
>>  }
>>  
>> +static bool bdrv_has_active_bds_parent(BlockDriverState *bs)
>> +{
>> +    BdrvChild *parent;
>> +
>> +    QLIST_FOREACH(parent, &bs->parents, next_parent) {
>> +        if (parent->role->parent_is_bds) {
>> +            BlockDriverState *parent_bs = parent->opaque;
>> +            if (!(parent_bs->open_flags & BDRV_O_INACTIVE)) {
>> +                return true;
>> +            }
>> +        }
>> +    }
> 
> Now I see why you say this might make sense as a permission.
> 
>> +
>> +    return false;
>> +}
>> +
>>  static int bdrv_inactivate_recurse(BlockDriverState *bs,
>>                                     bool setting_flag)
>>  {
>> @@ -4622,6 +4638,12 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
>>          return -ENOMEDIUM;
>>      }
>>  
>> +    /* Make sure that we don't inactivate a child before its parent.
>> +     * It will be covered by recursion from the yet active parent. */
>> +    if (bdrv_has_active_bds_parent(bs)) {

Another thing I found while testing: I think this should only return
early if setting_flag is true.  BDRV_O_INACTIVE will only be set on the
second pass.  If you skip nodes with active parents unconditionally,
bs->drv->bdrv_inactivate() will not be called for any non-root BDS
(because bdrv_has_active_bds_parent() returns true for all non-root
BDSs during the first pass).
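
For illustration, the guard described here would look roughly like this
(a sketch, not part of the posted patch):

    /* Sketch: only skip during the second pass (setting_flag == true);
     * the first pass must still visit every node so that
     * bs->drv->bdrv_inactivate() runs for non-root nodes as well. */
    if (setting_flag && bdrv_has_active_bds_parent(bs)) {
        return 0;
    }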

>> +        return 0;
>> +    }
>> +
> 
> Hm.  Wouldn't it make more sense to always return early when there are
> any BDS parents?  Because if there are any BDS parents and none of them
> are active (anymore), then this child will have been inactivated
> already, and we can save ourselves the trouble of going through the rest
> of the function again.

Hm, well, no, at least not directly here.  (Otherwise
bdrv_inactivate_recurse() will not really recurse when it calls itself...)
But bdrv_inactivate_all() could check that before invoking this function.

Ah, but then it is possible to still run into the exact bug you're
fixing here, because a node might inactivate a child that has another
parent still (which is inactivated later).

But still, I think bdrv_inactivate_all() should skip non-root BDSs,
because calling bs->drv->bdrv_inactivate() and parent->role->inactivate()
multiple times cannot be right.
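
A sketch of that idea, where bdrv_has_bds_parent() is a hypothetical
helper like bdrv_has_active_bds_parent() but ignoring whether the parent
is still active:

    /* Hypothetical: have bdrv_inactivate_all() start the recursion only
     * at root nodes; all other nodes are reached through their parents. */
    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        if (bdrv_has_bds_parent(bs)) {
            continue;   /* inactivated via recursion from a parent */
        }
        ret = bdrv_inactivate_recurse(bs, setting_flag);
        if (ret < 0) {
            goto out;
        }
    }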

Max

>> Do drivers support multiple calls to .bdrv_inactivate() at all?
> 
> Max
> 
>>      if (!setting_flag && bs->drv->bdrv_inactivate) {
>>          ret = bs->drv->bdrv_inactivate(bs);
>>          if (ret < 0) {
>>
> 
>
Max Reitz Nov. 26, 2018, 12:44 p.m. UTC | #4
On 26.11.18 13:33, Kevin Wolf wrote:
> Am 26.11.2018 um 13:05 hat Max Reitz geschrieben:
>> On 26.11.18 12:28, Kevin Wolf wrote:
>>> bdrv_child_cb_inactivate() asserts that parents are already inactive
>>> when children get inactivated. This precondition is necessary because
>>> parents could still issue requests in their inactivation code.
>>>
>>> When block nodes are created individually with -blockdev, all of them
>>> are monitor owned and will be returned by bdrv_next() in an undefined
>>> order (in practice, in the order of their creation, which is usually
>>> children before parents), which obviously fails the assertion.
>>>
>>> This patch fixes the ordering by skipping nodes with still active
>>> parents in bdrv_inactivate_recurse() because we know that they will be
>>> covered by recursion when the last active parent becomes inactive.
>>>
>>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
>>> ---
>>>  block.c | 22 ++++++++++++++++++++++
>>>  1 file changed, 22 insertions(+)
>>>
>>> diff --git a/block.c b/block.c
>>> index 5ba3435f8f..0569275e31 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -4612,6 +4612,22 @@ void bdrv_invalidate_cache_all(Error **errp)
>>>      }
>>>  }
>>>  
>>> +static bool bdrv_has_active_bds_parent(BlockDriverState *bs)
>>> +{
>>> +    BdrvChild *parent;
>>> +
>>> +    QLIST_FOREACH(parent, &bs->parents, next_parent) {
>>> +        if (parent->role->parent_is_bds) {
>>> +            BlockDriverState *parent_bs = parent->opaque;
>>> +            if (!(parent_bs->open_flags & BDRV_O_INACTIVE)) {
>>> +                return true;
>>> +            }
>>> +        }
>>> +    }
>>
>> Now I see why you say this might make sense as a permission.
> 
> You do? To be honest, now that I found this solution, I don't think a
> permission makes much sense any more, because you would have the same
> loop, and you would only be checking a different flag in the end.
> 
>>> +
>>> +    return false;
>>> +}
>>> +
>>>  static int bdrv_inactivate_recurse(BlockDriverState *bs,
>>>                                     bool setting_flag)
>>>  {
>>> @@ -4622,6 +4638,12 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
>>>          return -ENOMEDIUM;
>>>      }
>>>  
>>> +    /* Make sure that we don't inactivate a child before its parent.
>>> +     * It will be covered by recursion from the yet active parent. */
>>> +    if (bdrv_has_active_bds_parent(bs)) {
>>> +        return 0;
>>> +    }
>>> +
>>
>> Hm.  Wouldn't it make more sense to always return early when there are
>> any BDS parents?  Because if there are any BDS parents and none of them
>> are active (anymore), then this child will have been inactivated
>> already, and we can save ourselves the trouble of going through the rest
>> of the function again.
> 
> I don't quite follow... If we always return early no matter whether
> there is an active parent, who will have inactivated the child?
> 
> After trying to write up why you're wrong, I think there are two cases
> and both of us have a point:
> 
> 1. bdrv_next() enumerates the child node first and then the last BDS
>    parent. This is what this patch fixes.
> 
>    It will inactivate the child exactly once, at the time that the last
>    parent has become inactive (and recursively calls this function for
>    each of its children). If you remove that one inactivation, too,
>    children won't be inactivated at all.
> 
> 2. bdrv_next() enumerates the last BDS parent first and then the child.
>    This is unlikely and might even be impossible today, but once we
>    expose bdrv_reopen() in QMP and let the user reconfigure the edges,
>    it probably becomes possible.

blockdev-snapshot exists today.

>    In this case, even after my patch we inactivate drivers twice. Maybe
>    we should just return early if BDRV_O_INACTIVE is already set. What
>    makes me kind of unsure is that we already test for this flag, but
>    only for part of the function.
> 
>    Any ideas how to test this? Can we create such a situation today?

As I wrote in my second mail just now, I think bdrv_inactivate_all()
needs to check this.

See attached diff to 234, but I don't know whether we can really test
this, as there is no failure when .bdrv_inactivate() is called multiple
times.  (What I've done is add debugging code to see that it is called
multiple times).

Max

>> Do drivers support multiple calls to .bdrv_inactivate() at all?
> 
> They were probably not designed for that... Not sure if there was ever a
> commit where we used to call them multiple times without failing the
> assertion first.
> 
> Kevin
>
diff --git a/tests/qemu-iotests/234 b/tests/qemu-iotests/234
index 1f695d337a..b119b4cc4d 100755
--- a/tests/qemu-iotests/234
+++ b/tests/qemu-iotests/234
@@ -27,11 +27,13 @@ iotests.verify_image_format(supported_fmts=['raw', 'qcow2'])
 iotests.verify_platform(['linux'])
 
 with iotests.FilePath('img') as img_path, \
+     iotests.FilePath('backing') as backing_path, \
      iotests.FilePath('mig_fifo_a') as fifo_a, \
      iotests.FilePath('mig_fifo_b') as fifo_b, \
      iotests.VM(path_suffix='a') as vm_a, \
      iotests.VM(path_suffix='b') as vm_b:
 
+    iotests.qemu_img_pipe('create', '-f', iotests.imgfmt, backing_path, '64M')
     iotests.qemu_img_pipe('create', '-f', iotests.imgfmt, img_path, '64M')
 
     os.mkfifo(fifo_a)
@@ -40,14 +42,20 @@ with iotests.FilePath('img') as img_path, \
     iotests.log('Launching source VM...')
     (vm_a.add_blockdev('file,filename=%s,node-name=drive0-file' % (img_path))
          .add_blockdev('%s,file=drive0-file,node-name=drive0' % (iotests.imgfmt))
+         .add_blockdev('file,filename=%s,node-name=drive0-backing-file' % (backing_path))
+         .add_blockdev('%s,file=drive0-backing-file,node-name=drive0-backing' % (iotests.imgfmt))
          .launch())
 
     iotests.log('Launching destination VM...')
     (vm_b.add_blockdev('file,filename=%s,node-name=drive0-file' % (img_path))
          .add_blockdev('%s,file=drive0-file,node-name=drive0' % (iotests.imgfmt))
+         .add_blockdev('file,filename=%s,node-name=drive0-backing-file' % (backing_path))
+         .add_blockdev('%s,file=drive0-backing-file,node-name=drive0-backing' % (iotests.imgfmt))
          .add_incoming("exec: cat '%s'" % (fifo_a))
          .launch())
 
+    vm_a.qmp('blockdev-snapshot', node='drive0-backing', overlay='drive0')
+
     iotests.log('Enabling migration QMP events on A...')
     iotests.log(vm_a.qmp('migrate-set-capabilities', capabilities=[
         {

Patch

diff --git a/block.c b/block.c
index 5ba3435f8f..0569275e31 100644
--- a/block.c
+++ b/block.c
@@ -4612,6 +4612,22 @@ void bdrv_invalidate_cache_all(Error **errp)
     }
 }
 
+static bool bdrv_has_active_bds_parent(BlockDriverState *bs)
+{
+    BdrvChild *parent;
+
+    QLIST_FOREACH(parent, &bs->parents, next_parent) {
+        if (parent->role->parent_is_bds) {
+            BlockDriverState *parent_bs = parent->opaque;
+            if (!(parent_bs->open_flags & BDRV_O_INACTIVE)) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
 static int bdrv_inactivate_recurse(BlockDriverState *bs,
                                    bool setting_flag)
 {
@@ -4622,6 +4638,12 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
         return -ENOMEDIUM;
     }
 
+    /* Make sure that we don't inactivate a child before its parent.
+     * It will be covered by recursion from the yet active parent. */
+    if (bdrv_has_active_bds_parent(bs)) {
+        return 0;
+    }
+
     if (!setting_flag && bs->drv->bdrv_inactivate) {
         ret = bs->drv->bdrv_inactivate(bs);
         if (ret < 0) {