diff mbox series

[for-2.11] block: Keep strong reference when draining all BDS

Message ID 20171109204315.27072-1-mreitz@redhat.com
State New
Headers show
Series [for-2.11] block: Keep strong reference when draining all BDS | expand

Commit Message

Max Reitz Nov. 9, 2017, 8:43 p.m. UTC
Draining a BDS may lead to graph modifications, which in turn may result
in it and other BDS being stripped of their current references.  If
bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
references themselves, the BDS they are trying to drain (or undrain) may
disappear right under their feet -- or, more specifically, under the
feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().

This fixes an occasional hang of iotest 194.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

Comments

Eric Blake Nov. 9, 2017, 9:02 p.m. UTC | #1
On 11/09/2017 02:43 PM, Max Reitz wrote:
> Draining a BDS may lead to graph modifications, which in turn may result
> in it and other BDS being stripped of their current references.  If
> bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
> references themselves, the BDS they are trying to drain (or undrain) may
> disappear right under their feet -- or, more specifically, under the
> feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
> 
> This fixes an occasional hang of iotest 194.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 44 insertions(+), 3 deletions(-)

> +
> +        /* Keep a strong reference to all root BDS and copy them into
> +         * an own list because draining them may lead to graph

'an own' sounds awkward; maybe 'copy them into a local list'

> +         * modifications. */
> +        bdrv_ref(bs);
> +        bs_list = g_slist_prepend(bs_list, bs);
>      }

>  void bdrv_drain_all_end(void)
>  {
>      BlockDriverState *bs;
>      BdrvNextIterator it;
> +    GSList *bs_list = NULL, *bs_list_entry;
> +
> +    /* Must be called from the main loop */
> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>  
> +    /* Keep a strong reference to all root BDS and copy them into an
> +     * own list because draining them may lead to graph modifications.

And again.

With that tweak,
Reviewed-by: Eric Blake <eblake@redhat.com>
Fam Zheng Nov. 10, 2017, 2:45 a.m. UTC | #2
On Thu, 11/09 21:43, Max Reitz wrote:
> Draining a BDS may lead to graph modifications, which in turn may result
> in it and other BDS being stripped of their current references.  If
> bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
> references themselves, the BDS they are trying to drain (or undrain) may
> disappear right under their feet -- or, more specifically, under the
> feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
> 
> This fixes an occasional hang of iotest 194.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 44 insertions(+), 3 deletions(-)
> 
> diff --git a/block/io.c b/block/io.c
> index 3d5ef2cabe..a0a2833e8e 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -340,7 +340,10 @@ void bdrv_drain_all_begin(void)
>      bool waited = true;
>      BlockDriverState *bs;
>      BdrvNextIterator it;
> -    GSList *aio_ctxs = NULL, *ctx;
> +    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
> +
> +    /* Must be called from the main loop */
> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>  
>      block_job_pause_all();
>  
> @@ -355,6 +358,12 @@ void bdrv_drain_all_begin(void)
>          if (!g_slist_find(aio_ctxs, aio_context)) {
>              aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
>          }
> +
> +        /* Keep a strong reference to all root BDS and copy them into
> +         * an own list because draining them may lead to graph
> +         * modifications. */
> +        bdrv_ref(bs);
> +        bs_list = g_slist_prepend(bs_list, bs);
>      }
>  
>      /* Note that completion of an asynchronous I/O operation can trigger any
> @@ -370,7 +379,11 @@ void bdrv_drain_all_begin(void)
>              AioContext *aio_context = ctx->data;
>  
>              aio_context_acquire(aio_context);
> -            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> +            for (bs_list_entry = bs_list; bs_list_entry;
> +                 bs_list_entry = bs_list_entry->next)
> +            {
> +                bs = bs_list_entry->data;
> +
>                  if (aio_context == bdrv_get_aio_context(bs)) {
>                      waited |= bdrv_drain_recurse(bs, true);
>                  }
> @@ -379,24 +392,52 @@ void bdrv_drain_all_begin(void)
>          }
>      }
>  
> +    for (bs_list_entry = bs_list; bs_list_entry;
> +         bs_list_entry = bs_list_entry->next)
> +    {
> +        bdrv_unref(bs_list_entry->data);
> +    }
> +
>      g_slist_free(aio_ctxs);
> +    g_slist_free(bs_list);
>  }
>  
>  void bdrv_drain_all_end(void)
>  {
>      BlockDriverState *bs;
>      BdrvNextIterator it;
> +    GSList *bs_list = NULL, *bs_list_entry;
> +
> +    /* Must be called from the main loop */
> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>  
> +    /* Keep a strong reference to all root BDS and copy them into an
> +     * own list because draining them may lead to graph modifications.
> +     */
>      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> -        AioContext *aio_context = bdrv_get_aio_context(bs);
> +        bdrv_ref(bs);
> +        bs_list = g_slist_prepend(bs_list, bs);
> +    }
> +
> +    for (bs_list_entry = bs_list; bs_list_entry;
> +         bs_list_entry = bs_list_entry->next)
> +    {
> +        AioContext *aio_context;
> +
> +        bs = bs_list_entry->data;
> +        aio_context = bdrv_get_aio_context(bs);
>  
>          aio_context_acquire(aio_context);
>          aio_enable_external(aio_context);
>          bdrv_parent_drained_end(bs);
>          bdrv_drain_recurse(bs, false);
>          aio_context_release(aio_context);
> +
> +        bdrv_unref(bs);
>      }
>  
> +    g_slist_free(bs_list);
> +
>      block_job_resume_all();
>  }
>  
> -- 
> 2.13.6
> 
> 

Is it better to put the references into BdrvNextIterator and introduce
bdrv_next_iterator_destroy() to free them? You'll need to touch all callers
because it is not C++, but it secures all of the rest, which seems vulnerable to
the same pattern, for example the aio_poll() in iothread_stop_all().

Fam
Stefan Hajnoczi Nov. 10, 2017, 9:19 a.m. UTC | #3
On Thu, Nov 09, 2017 at 09:43:15PM +0100, Max Reitz wrote:
> Draining a BDS may lead to graph modifications, which in turn may result
> in it and other BDS being stripped of their current references.  If
> bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
> references themselves, the BDS they are trying to drain (or undrain) may
> disappear right under their feet -- or, more specifically, under the
> feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
> 
> This fixes an occasional hang of iotest 194.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 44 insertions(+), 3 deletions(-)
> 
> diff --git a/block/io.c b/block/io.c
> index 3d5ef2cabe..a0a2833e8e 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -340,7 +340,10 @@ void bdrv_drain_all_begin(void)
>      bool waited = true;
>      BlockDriverState *bs;
>      BdrvNextIterator it;
> -    GSList *aio_ctxs = NULL, *ctx;
> +    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
> +
> +    /* Must be called from the main loop */
> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>  
>      block_job_pause_all();
>  
> @@ -355,6 +358,12 @@ void bdrv_drain_all_begin(void)
>          if (!g_slist_find(aio_ctxs, aio_context)) {
>              aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
>          }
> +
> +        /* Keep a strong reference to all root BDS and copy them into
> +         * an own list because draining them may lead to graph
> +         * modifications. */
> +        bdrv_ref(bs);
> +        bs_list = g_slist_prepend(bs_list, bs);
>      }
>  
>      /* Note that completion of an asynchronous I/O operation can trigger any
> @@ -370,7 +379,11 @@ void bdrv_drain_all_begin(void)
>              AioContext *aio_context = ctx->data;
>  
>              aio_context_acquire(aio_context);
> -            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> +            for (bs_list_entry = bs_list; bs_list_entry;
> +                 bs_list_entry = bs_list_entry->next)
> +            {
> +                bs = bs_list_entry->data;
> +
>                  if (aio_context == bdrv_get_aio_context(bs)) {
>                      waited |= bdrv_drain_recurse(bs, true);
>                  }
> @@ -379,24 +392,52 @@ void bdrv_drain_all_begin(void)
>          }
>      }
>  
> +    for (bs_list_entry = bs_list; bs_list_entry;
> +         bs_list_entry = bs_list_entry->next)
> +    {
> +        bdrv_unref(bs_list_entry->data);
> +    }
> +
>      g_slist_free(aio_ctxs);
> +    g_slist_free(bs_list);
>  }

Which specific parts of this function access bs without a reference?

I see bdrv_next() may do QTAILQ_NEXT(bs, monitor_list) after
bdrv_drain_recurse() has returned.

Anything else?

If bdrv_next() is the only issue then I agree with Fam that it makes
sense to build the ref/unref into bdrv_next().
Kevin Wolf Nov. 10, 2017, 1:17 p.m. UTC | #4
Am 10.11.2017 um 03:45 hat Fam Zheng geschrieben:
> On Thu, 11/09 21:43, Max Reitz wrote:
> > Draining a BDS may lead to graph modifications, which in turn may result
> > in it and other BDS being stripped of their current references.  If
> > bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
> > references themselves, the BDS they are trying to drain (or undrain) may
> > disappear right under their feet -- or, more specifically, under the
> > feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
> > 
> > This fixes an occasional hang of iotest 194.
> > 
> > Signed-off-by: Max Reitz <mreitz@redhat.com>
> > ---
> >  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 44 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block/io.c b/block/io.c
> > index 3d5ef2cabe..a0a2833e8e 100644
> > --- a/block/io.c
> > +++ b/block/io.c
> > @@ -340,7 +340,10 @@ void bdrv_drain_all_begin(void)
> >      bool waited = true;
> >      BlockDriverState *bs;
> >      BdrvNextIterator it;
> > -    GSList *aio_ctxs = NULL, *ctx;
> > +    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
> > +
> > +    /* Must be called from the main loop */
> > +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
> >  
> >      block_job_pause_all();
> >  
> > @@ -355,6 +358,12 @@ void bdrv_drain_all_begin(void)
> >          if (!g_slist_find(aio_ctxs, aio_context)) {
> >              aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
> >          }
> > +
> > +        /* Keep a strong reference to all root BDS and copy them into
> > +         * an own list because draining them may lead to graph
> > +         * modifications. */
> > +        bdrv_ref(bs);
> > +        bs_list = g_slist_prepend(bs_list, bs);
> >      }
> >  
> >      /* Note that completion of an asynchronous I/O operation can trigger any
> > @@ -370,7 +379,11 @@ void bdrv_drain_all_begin(void)
> >              AioContext *aio_context = ctx->data;
> >  
> >              aio_context_acquire(aio_context);
> > -            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> > +            for (bs_list_entry = bs_list; bs_list_entry;
> > +                 bs_list_entry = bs_list_entry->next)
> > +            {
> > +                bs = bs_list_entry->data;
> > +
> >                  if (aio_context == bdrv_get_aio_context(bs)) {
> >                      waited |= bdrv_drain_recurse(bs, true);
> >                  }
> > @@ -379,24 +392,52 @@ void bdrv_drain_all_begin(void)
> >          }
> >      }
> >  
> > +    for (bs_list_entry = bs_list; bs_list_entry;
> > +         bs_list_entry = bs_list_entry->next)
> > +    {
> > +        bdrv_unref(bs_list_entry->data);
> > +    }
> > +
> >      g_slist_free(aio_ctxs);
> > +    g_slist_free(bs_list);
> >  }
> >  
> >  void bdrv_drain_all_end(void)
> >  {
> >      BlockDriverState *bs;
> >      BdrvNextIterator it;
> > +    GSList *bs_list = NULL, *bs_list_entry;
> > +
> > +    /* Must be called from the main loop */
> > +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
> >  
> > +    /* Keep a strong reference to all root BDS and copy them into an
> > +     * own list because draining them may lead to graph modifications.
> > +     */
> >      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> > -        AioContext *aio_context = bdrv_get_aio_context(bs);
> > +        bdrv_ref(bs);
> > +        bs_list = g_slist_prepend(bs_list, bs);
> > +    }
> > +
> > +    for (bs_list_entry = bs_list; bs_list_entry;
> > +         bs_list_entry = bs_list_entry->next)
> > +    {
> > +        AioContext *aio_context;
> > +
> > +        bs = bs_list_entry->data;
> > +        aio_context = bdrv_get_aio_context(bs);
> >  
> >          aio_context_acquire(aio_context);
> >          aio_enable_external(aio_context);
> >          bdrv_parent_drained_end(bs);
> >          bdrv_drain_recurse(bs, false);
> >          aio_context_release(aio_context);
> > +
> > +        bdrv_unref(bs);
> >      }
> >  
> > +    g_slist_free(bs_list);
> > +
> >      block_job_resume_all();
> >  }
> 
> It is better to put the references into BdrvNextIterator and introduce
> bdrv_next_iterator_destroy() to free them? You'll need to touch all callers
> because it is not C++, but it secures all of rest, which seems vulnerable in the
> same pattern, for example the aio_poll() in iothread_stop_all().

You could automatically free the references when bdrv_next() returns
NULL. Then you need an explicit bdrv_next_iterator_destroy() only for
callers that stop iterating halfway through the list.

Do you actually need to keep references to all BDSes in the whole list
while using the iterator or would it be enough to just keep a reference
to the current one?

Kevin
Fam Zheng Nov. 10, 2017, 1:32 p.m. UTC | #5
On Fri, 11/10 14:17, Kevin Wolf wrote:
> Am 10.11.2017 um 03:45 hat Fam Zheng geschrieben:
> > On Thu, 11/09 21:43, Max Reitz wrote:
> > > Draining a BDS may lead to graph modifications, which in turn may result
> > > in it and other BDS being stripped of their current references.  If
> > > bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
> > > references themselves, the BDS they are trying to drain (or undrain) may
> > > disappear right under their feet -- or, more specifically, under the
> > > feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
> > > 
> > > This fixes an occasional hang of iotest 194.
> > > 
> > > Signed-off-by: Max Reitz <mreitz@redhat.com>
> > > ---
> > >  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 44 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/block/io.c b/block/io.c
> > > index 3d5ef2cabe..a0a2833e8e 100644
> > > --- a/block/io.c
> > > +++ b/block/io.c
> > > @@ -340,7 +340,10 @@ void bdrv_drain_all_begin(void)
> > >      bool waited = true;
> > >      BlockDriverState *bs;
> > >      BdrvNextIterator it;
> > > -    GSList *aio_ctxs = NULL, *ctx;
> > > +    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
> > > +
> > > +    /* Must be called from the main loop */
> > > +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
> > >  
> > >      block_job_pause_all();
> > >  
> > > @@ -355,6 +358,12 @@ void bdrv_drain_all_begin(void)
> > >          if (!g_slist_find(aio_ctxs, aio_context)) {
> > >              aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
> > >          }
> > > +
> > > +        /* Keep a strong reference to all root BDS and copy them into
> > > +         * an own list because draining them may lead to graph
> > > +         * modifications. */
> > > +        bdrv_ref(bs);
> > > +        bs_list = g_slist_prepend(bs_list, bs);
> > >      }
> > >  
> > >      /* Note that completion of an asynchronous I/O operation can trigger any
> > > @@ -370,7 +379,11 @@ void bdrv_drain_all_begin(void)
> > >              AioContext *aio_context = ctx->data;
> > >  
> > >              aio_context_acquire(aio_context);
> > > -            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> > > +            for (bs_list_entry = bs_list; bs_list_entry;
> > > +                 bs_list_entry = bs_list_entry->next)
> > > +            {
> > > +                bs = bs_list_entry->data;
> > > +
> > >                  if (aio_context == bdrv_get_aio_context(bs)) {
> > >                      waited |= bdrv_drain_recurse(bs, true);
> > >                  }
> > > @@ -379,24 +392,52 @@ void bdrv_drain_all_begin(void)
> > >          }
> > >      }
> > >  
> > > +    for (bs_list_entry = bs_list; bs_list_entry;
> > > +         bs_list_entry = bs_list_entry->next)
> > > +    {
> > > +        bdrv_unref(bs_list_entry->data);
> > > +    }
> > > +
> > >      g_slist_free(aio_ctxs);
> > > +    g_slist_free(bs_list);
> > >  }
> > >  
> > >  void bdrv_drain_all_end(void)
> > >  {
> > >      BlockDriverState *bs;
> > >      BdrvNextIterator it;
> > > +    GSList *bs_list = NULL, *bs_list_entry;
> > > +
> > > +    /* Must be called from the main loop */
> > > +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
> > >  
> > > +    /* Keep a strong reference to all root BDS and copy them into an
> > > +     * own list because draining them may lead to graph modifications.
> > > +     */
> > >      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> > > -        AioContext *aio_context = bdrv_get_aio_context(bs);
> > > +        bdrv_ref(bs);
> > > +        bs_list = g_slist_prepend(bs_list, bs);
> > > +    }
> > > +
> > > +    for (bs_list_entry = bs_list; bs_list_entry;
> > > +         bs_list_entry = bs_list_entry->next)
> > > +    {
> > > +        AioContext *aio_context;
> > > +
> > > +        bs = bs_list_entry->data;
> > > +        aio_context = bdrv_get_aio_context(bs);
> > >  
> > >          aio_context_acquire(aio_context);
> > >          aio_enable_external(aio_context);
> > >          bdrv_parent_drained_end(bs);
> > >          bdrv_drain_recurse(bs, false);
> > >          aio_context_release(aio_context);
> > > +
> > > +        bdrv_unref(bs);
> > >      }
> > >  
> > > +    g_slist_free(bs_list);
> > > +
> > >      block_job_resume_all();
> > >  }
> > 
> > It is better to put the references into BdrvNextIterator and introduce
> > bdrv_next_iterator_destroy() to free them? You'll need to touch all callers
> > because it is not C++, but it secures all of rest, which seems vulnerable in the
> > same pattern, for example the aio_poll() in iothread_stop_all().
> 
> You could automatically free the references when bdrv_next() returns
> NULL. Then you need an explicit bdrv_next_iterator_destroy() only for
> callers that stop iterating halfway through the list.

Yes, good idea.

> 
> Do you actually need to keep references to all BDSes in the whole list
> while using the iterator or would it be enough to just keep a reference
> to the current one?

To fix the bug we now see, I think keeping the current one is enough, but I think
implementing it just like this patch is also good for some future-proofing: we
cannot know what will be wedged into the nested aio_poll()'s over time (and yes,
we should really reduce the number of them).

Fam
Max Reitz Nov. 10, 2017, 3:23 p.m. UTC | #6
On 2017-11-10 14:32, Fam Zheng wrote:
> On Fri, 11/10 14:17, Kevin Wolf wrote:
>> Am 10.11.2017 um 03:45 hat Fam Zheng geschrieben:
>>> On Thu, 11/09 21:43, Max Reitz wrote:
>>>> Draining a BDS may lead to graph modifications, which in turn may result
>>>> in it and other BDS being stripped of their current references.  If
>>>> bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
>>>> references themselves, the BDS they are trying to drain (or undrain) may
>>>> disappear right under their feet -- or, more specifically, under the
>>>> feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
>>>>
>>>> This fixes an occasional hang of iotest 194.
>>>>
>>>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>>>> ---
>>>>  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>>>>  1 file changed, 44 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/block/io.c b/block/io.c
>>>> index 3d5ef2cabe..a0a2833e8e 100644
>>>> --- a/block/io.c
>>>> +++ b/block/io.c
>>>> @@ -340,7 +340,10 @@ void bdrv_drain_all_begin(void)
>>>>      bool waited = true;
>>>>      BlockDriverState *bs;
>>>>      BdrvNextIterator it;
>>>> -    GSList *aio_ctxs = NULL, *ctx;
>>>> +    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
>>>> +
>>>> +    /* Must be called from the main loop */
>>>> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>>>>  
>>>>      block_job_pause_all();
>>>>  
>>>> @@ -355,6 +358,12 @@ void bdrv_drain_all_begin(void)
>>>>          if (!g_slist_find(aio_ctxs, aio_context)) {
>>>>              aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
>>>>          }
>>>> +
>>>> +        /* Keep a strong reference to all root BDS and copy them into
>>>> +         * an own list because draining them may lead to graph
>>>> +         * modifications. */
>>>> +        bdrv_ref(bs);
>>>> +        bs_list = g_slist_prepend(bs_list, bs);
>>>>      }
>>>>  
>>>>      /* Note that completion of an asynchronous I/O operation can trigger any
>>>> @@ -370,7 +379,11 @@ void bdrv_drain_all_begin(void)
>>>>              AioContext *aio_context = ctx->data;
>>>>  
>>>>              aio_context_acquire(aio_context);
>>>> -            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
>>>> +            for (bs_list_entry = bs_list; bs_list_entry;
>>>> +                 bs_list_entry = bs_list_entry->next)
>>>> +            {
>>>> +                bs = bs_list_entry->data;
>>>> +
>>>>                  if (aio_context == bdrv_get_aio_context(bs)) {
>>>>                      waited |= bdrv_drain_recurse(bs, true);
>>>>                  }
>>>> @@ -379,24 +392,52 @@ void bdrv_drain_all_begin(void)
>>>>          }
>>>>      }
>>>>  
>>>> +    for (bs_list_entry = bs_list; bs_list_entry;
>>>> +         bs_list_entry = bs_list_entry->next)
>>>> +    {
>>>> +        bdrv_unref(bs_list_entry->data);
>>>> +    }
>>>> +
>>>>      g_slist_free(aio_ctxs);
>>>> +    g_slist_free(bs_list);
>>>>  }
>>>>  
>>>>  void bdrv_drain_all_end(void)
>>>>  {
>>>>      BlockDriverState *bs;
>>>>      BdrvNextIterator it;
>>>> +    GSList *bs_list = NULL, *bs_list_entry;
>>>> +
>>>> +    /* Must be called from the main loop */
>>>> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>>>>  
>>>> +    /* Keep a strong reference to all root BDS and copy them into an
>>>> +     * own list because draining them may lead to graph modifications.
>>>> +     */
>>>>      for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
>>>> -        AioContext *aio_context = bdrv_get_aio_context(bs);
>>>> +        bdrv_ref(bs);
>>>> +        bs_list = g_slist_prepend(bs_list, bs);
>>>> +    }
>>>> +
>>>> +    for (bs_list_entry = bs_list; bs_list_entry;
>>>> +         bs_list_entry = bs_list_entry->next)
>>>> +    {
>>>> +        AioContext *aio_context;
>>>> +
>>>> +        bs = bs_list_entry->data;
>>>> +        aio_context = bdrv_get_aio_context(bs);
>>>>  
>>>>          aio_context_acquire(aio_context);
>>>>          aio_enable_external(aio_context);
>>>>          bdrv_parent_drained_end(bs);
>>>>          bdrv_drain_recurse(bs, false);
>>>>          aio_context_release(aio_context);
>>>> +
>>>> +        bdrv_unref(bs);
>>>>      }
>>>>  
>>>> +    g_slist_free(bs_list);
>>>> +
>>>>      block_job_resume_all();
>>>>  }
>>>
>>> It is better to put the references into BdrvNextIterator and introduce
>>> bdrv_next_iterator_destroy() to free them? You'll need to touch all callers
>>> because it is not C++, but it secures all of rest, which seems vulnerable in the
>>> same pattern, for example the aio_poll() in iothread_stop_all().
>>
>> You could automatically free the references when bdrv_next() returns
>> NULL. Then you need an explicit bdrv_next_iterator_destroy() only for
>> callers that stop iterating halfway through the list.
> Yes, good idea.

But bdrv_unref() is safe only in the main loop.  Without having checked,
I'm not sure whether all callers of bdrv_next() are running in the main
loop.

I'd rather introduce a bdrv_next_safe() which is used by those callers
which need it.

>> Do you actually need to keep references to all BDSes in the whole list
>> while using the iterator or would it be enough to just keep a reference
>> to the current one?
> 
> To fix the bug we now see I think keeping the current is enough, but I think
> implementing just like this patch is also good with some future-proofing: we
> cannot know what will be wedged into the nexted aio_poll()'s over time (and yes,
> we should really reduce the number of them.)

I don't really want to think about whether it's safe to only keep a
reference to the current BDS.  I can't imagine any case where destroying
one root BDS leads to destroying another, but I'd rather be safe and not
have to think about it.  (Unless there is an important reason to only
keep a strong reference to the current one.)

Max
Max Reitz Nov. 10, 2017, 3:26 p.m. UTC | #7
On 2017-11-10 10:19, Stefan Hajnoczi wrote:
> On Thu, Nov 09, 2017 at 09:43:15PM +0100, Max Reitz wrote:
>> Draining a BDS may lead to graph modifications, which in turn may result
>> in it and other BDS being stripped of their current references.  If
>> bdrv_drain_all_begin() and bdrv_drain_all_end() do not keep strong
>> references themselves, the BDS they are trying to drain (or undrain) may
>> disappear right under their feet -- or, more specifically, under the
>> feet of BDRV_POLL_WHILE() in bdrv_drain_recurse().
>>
>> This fixes an occasional hang of iotest 194.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>  block/io.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 44 insertions(+), 3 deletions(-)
>>
>> diff --git a/block/io.c b/block/io.c
>> index 3d5ef2cabe..a0a2833e8e 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -340,7 +340,10 @@ void bdrv_drain_all_begin(void)
>>      bool waited = true;
>>      BlockDriverState *bs;
>>      BdrvNextIterator it;
>> -    GSList *aio_ctxs = NULL, *ctx;
>> +    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
>> +
>> +    /* Must be called from the main loop */
>> +    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
>>  
>>      block_job_pause_all();
>>  
>> @@ -355,6 +358,12 @@ void bdrv_drain_all_begin(void)
>>          if (!g_slist_find(aio_ctxs, aio_context)) {
>>              aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
>>          }
>> +
>> +        /* Keep a strong reference to all root BDS and copy them into
>> +         * an own list because draining them may lead to graph
>> +         * modifications. */
>> +        bdrv_ref(bs);
>> +        bs_list = g_slist_prepend(bs_list, bs);
>>      }
>>  
>>      /* Note that completion of an asynchronous I/O operation can trigger any
>> @@ -370,7 +379,11 @@ void bdrv_drain_all_begin(void)
>>              AioContext *aio_context = ctx->data;
>>  
>>              aio_context_acquire(aio_context);
>> -            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
>> +            for (bs_list_entry = bs_list; bs_list_entry;
>> +                 bs_list_entry = bs_list_entry->next)
>> +            {
>> +                bs = bs_list_entry->data;
>> +
>>                  if (aio_context == bdrv_get_aio_context(bs)) {
>>                      waited |= bdrv_drain_recurse(bs, true);
>>                  }
>> @@ -379,24 +392,52 @@ void bdrv_drain_all_begin(void)
>>          }
>>      }
>>  
>> +    for (bs_list_entry = bs_list; bs_list_entry;
>> +         bs_list_entry = bs_list_entry->next)
>> +    {
>> +        bdrv_unref(bs_list_entry->data);
>> +    }
>> +
>>      g_slist_free(aio_ctxs);
>> +    g_slist_free(bs_list);
>>  }
> 
> Which specific parts of this function access bs without a reference?
> 
> I see bdrv_next() may do QTAILQ_NEXT(bs, monitor_list) after
> bdrv_drain_recurse() has returned.
> 
> Anything else?
> 
> If bdrv_next() is the only issue then I agree with Fam that it makes
> sense to build the ref/unref into bdrv_next().

These don't.  It's BDRV_POLL_WHILE() in bdrv_drain_recurse(), as written
in the commit message.

You cannot add a bdrv_ref()/bdrv_unref() pair there because
bdrv_drain_recurse() is called from bdrv_close() with a refcount of 0,
so having a bdrv_unref() there would cause an infinite recursion.

I think it's reasonable to expect callers of any bdrv_* function (except
for bdrv_unref()) to make sure that the BDS isn't deleted over the
course of that function.  Therefore, I think that the actual issue is
here and we need to make sure here that we have a strong reference
before invoking bdrv_drain_recurse().

Max
Fam Zheng Nov. 10, 2017, 3:31 p.m. UTC | #8
On Fri, 11/10 16:23, Max Reitz wrote:
> But bdrv_unref() is safe only in the main loop.  Without having checked,
> I'm not sure whether all callers of bdrv_next() are running in the main
> loop.

They must be. The reasoning is simple:

1) one needs to acquire the ctx of all the BDSes for safe access;
2) only main loop can acquire any BDS' ctx;

So there is no way bdrv_next can work in an IOThread.

Fam
Kevin Wolf Nov. 10, 2017, 4:05 p.m. UTC | #9
Am 10.11.2017 um 16:23 hat Max Reitz geschrieben:
> On 2017-11-10 14:32, Fam Zheng wrote:
> > On Fri, 11/10 14:17, Kevin Wolf wrote:
> >> Do you actually need to keep references to all BDSes in the whole list
> >> while using the iterator or would it be enough to just keep a reference
> >> to the current one?
> > 
> > To fix the bug we now see I think keeping the current is enough, but I think
> > implementing just like this patch is also good with some future-proofing: we
> > cannot know what will be wedged into the nexted aio_poll()'s over time (and yes,
> > we should really reduce the number of them.)
> 
> I don't really want to think about whether it's safe to only keep a
> reference to the current BDS.  I can't imagine any case where destroying
> one root BDS leads to destroying another, but I'd rather be safe and not
> have to think about it.  (Unless there is an important reason to only
> keep a strong reference to the current one.)

Why would it be a problem if another BDS from the list went away? If it
is one that was already processed, we don't care, and if it was in the
yet unprocessed part of the list, we'll just never return it.

Kevin
Max Reitz Nov. 10, 2017, 4:13 p.m. UTC | #10
On 2017-11-10 17:05, Kevin Wolf wrote:
> Am 10.11.2017 um 16:23 hat Max Reitz geschrieben:
>> On 2017-11-10 14:32, Fam Zheng wrote:
>>> On Fri, 11/10 14:17, Kevin Wolf wrote:
>>>> Do you actually need to keep references to all BDSes in the whole list
>>>> while using the iterator or would it be enough to just keep a reference
>>>> to the current one?
>>>
>>> To fix the bug we now see I think keeping the current is enough, but I think
>>> implementing just like this patch is also good with some future-proofing: we
>>> cannot know what will be wedged into the nexted aio_poll()'s over time (and yes,
>>> we should really reduce the number of them.)
>>
>> I don't really want to think about whether it's safe to only keep a
>> reference to the current BDS.  I can't imagine any case where destroying
>> one root BDS leads to destroying another, but I'd rather be safe and not
>> have to think about it.  (Unless there is an important reason to only
>> keep a strong reference to the current one.)
> 
> Why would it be a problem if another BDS from the list went away? If it
> is one that was already processed, we don't care, and if it was in the
> yet unprocessed part of the list, we'll just never return it.

You mean from bdrv_next() in its current form?  Well, I know that when I
just put a bdrv_ref()/bdrv_unref() pair around the drain, I got a
segfault in blk_all_next() in bdrv_next().  I can investigate more, but
that's pretty much what I mean by "I don't really want to think about it".

So that's why I want bdrv_next() to copy all BDS into another list
instead of iterating through them on the fly.  And if we do that, a
disappearing BDS of course is an issue because we don't notice until
we're trying to iterate over it, at which point we have a use-after-free.

Max
Kevin Wolf Nov. 10, 2017, 4:22 p.m. UTC | #11
Am 10.11.2017 um 17:13 hat Max Reitz geschrieben:
> On 2017-11-10 17:05, Kevin Wolf wrote:
> > Am 10.11.2017 um 16:23 hat Max Reitz geschrieben:
> >> On 2017-11-10 14:32, Fam Zheng wrote:
> >>> On Fri, 11/10 14:17, Kevin Wolf wrote:
> >>>> Do you actually need to keep references to all BDSes in the whole list
> >>>> while using the iterator or would it be enough to just keep a reference
> >>>> to the current one?
> >>>
> >>> To fix the bug we now see I think keeping the current is enough, but I think
> >>> implementing just like this patch is also good with some future-proofing: we
> >>> cannot know what will be wedged into the nested aio_poll()'s over time (and yes,
> >>> we should really reduce the number of them.)
> >>
> >> I don't really want to think about whether it's safe to only keep a
> >> reference to the current BDS.  I can't imagine any case where destroying
> >> one root BDS leads to destroying another, but I'd rather be safe and not
> >> have to think about it.  (Unless there is an important reason to only
> >> keep a strong reference to the current one.)
> > 
> > Why would it be a problem if another BDS from the list went away? If it
> > is one that was already processed, we don't care, and if it was in the
> > yet unprocessed part of the list, we'll just never return it.
> 
> You mean from bdrv_next() in its current form?  Well, I know that when I
> just put a bdrv_ref()/bdrv_unref() pair around the drain, I got a
> segfault in blk_all_next() in bdrv_next().  I can investigate more, but
> that's pretty much what I mean by "I don't really want to think about it".

No, I mean a bdrv_next() that is modified to bdrv_ref() only what
it->blk/it->bs point to currently instead of allocating a whole list.

Kevin

> So that's why I want bdrv_next() to copy all BDS into another list
> instead of iterating through them on the fly.  And if we do that, a
> disappearing BDS of course is an issue because we don't notice until
> we're trying to iterate over it, at which point we have a use-after-free.
> 
> Max
Max Reitz Nov. 10, 2017, 4:43 p.m. UTC | #12
On 2017-11-10 17:22, Kevin Wolf wrote:
> Am 10.11.2017 um 17:13 hat Max Reitz geschrieben:
>> On 2017-11-10 17:05, Kevin Wolf wrote:
>>> Am 10.11.2017 um 16:23 hat Max Reitz geschrieben:
>>>> On 2017-11-10 14:32, Fam Zheng wrote:
>>>>> On Fri, 11/10 14:17, Kevin Wolf wrote:
>>>>>> Do you actually need to keep references to all BDSes in the whole list
>>>>>> while using the iterator or would it be enough to just keep a reference
>>>>>> to the current one?
>>>>>
>>>>> To fix the bug we now see I think keeping the current is enough, but I think
>>>>> implementing just like this patch is also good with some future-proofing: we
>>>>> cannot know what will be wedged into the nested aio_poll()'s over time (and yes,
>>>>> we should really reduce the number of them.)
>>>>
>>>> I don't really want to think about whether it's safe to only keep a
>>>> reference to the current BDS.  I can't imagine any case where destroying
>>>> one root BDS leads to destroying another, but I'd rather be safe and not
>>>> have to think about it.  (Unless there is an important reason to only
>>>> keep a strong reference to the current one.)
>>>
>>> Why would it be a problem if another BDS from the list went away? If it
>>> is one that was already processed, we don't care, and if it was in the
>>> yet unprocessed part of the list, we'll just never return it.
>>
>> You mean from bdrv_next() in its current form?  Well, I know that when I
>> just put a bdrv_ref()/bdrv_unref() pair around the drain, I got a
>> segfault in blk_all_next() in bdrv_next().  I can investigate more, but
>> that's pretty much what I mean by "I don't really want to think about it".
> 
> No, I mean a bdrv_next() that is modified to bdrv_ref() only what
> it->blk/it->bs point to currently instead of allocating a whole list.

Seems to work, I guess my issue was that I unref'd the BDS too early
(should do that only after the next pointer has been fetched...).

This also means adding a new function like bdrv_next_cleanup() that has
to be called if you don't want to iterate over all of the BDSs, but I
guess it's still less code to write.

Max
diff mbox series

Patch

diff --git a/block/io.c b/block/io.c
index 3d5ef2cabe..a0a2833e8e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -340,7 +340,10 @@  void bdrv_drain_all_begin(void)
     bool waited = true;
     BlockDriverState *bs;
     BdrvNextIterator it;
-    GSList *aio_ctxs = NULL, *ctx;
+    GSList *aio_ctxs = NULL, *ctx, *bs_list = NULL, *bs_list_entry;
+
+    /* Must be called from the main loop */
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
     block_job_pause_all();
 
@@ -355,6 +358,12 @@  void bdrv_drain_all_begin(void)
         if (!g_slist_find(aio_ctxs, aio_context)) {
             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
         }
+
+        /* Keep a strong reference to all root BDS and copy them into
+         * a local list because draining them may lead to graph
+         * modifications. */
+        bdrv_ref(bs);
+        bs_list = g_slist_prepend(bs_list, bs);
     }
 
     /* Note that completion of an asynchronous I/O operation can trigger any
@@ -370,7 +379,11 @@  void bdrv_drain_all_begin(void)
             AioContext *aio_context = ctx->data;
 
             aio_context_acquire(aio_context);
-            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+            for (bs_list_entry = bs_list; bs_list_entry;
+                 bs_list_entry = bs_list_entry->next)
+            {
+                bs = bs_list_entry->data;
+
                 if (aio_context == bdrv_get_aio_context(bs)) {
                     waited |= bdrv_drain_recurse(bs, true);
                 }
@@ -379,24 +392,52 @@  void bdrv_drain_all_begin(void)
         }
     }
 
+    for (bs_list_entry = bs_list; bs_list_entry;
+         bs_list_entry = bs_list_entry->next)
+    {
+        bdrv_unref(bs_list_entry->data);
+    }
+
     g_slist_free(aio_ctxs);
+    g_slist_free(bs_list);
 }
 
 void bdrv_drain_all_end(void)
 {
     BlockDriverState *bs;
     BdrvNextIterator it;
+    GSList *bs_list = NULL, *bs_list_entry;
+
+    /* Must be called from the main loop */
+    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 
+    /* Keep a strong reference to all root BDS and copy them into a
+     * local list because draining them may lead to graph modifications.
+     */
     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
+        bdrv_ref(bs);
+        bs_list = g_slist_prepend(bs_list, bs);
+    }
+
+    for (bs_list_entry = bs_list; bs_list_entry;
+         bs_list_entry = bs_list_entry->next)
+    {
+        AioContext *aio_context;
+
+        bs = bs_list_entry->data;
+        aio_context = bdrv_get_aio_context(bs);
 
         aio_context_acquire(aio_context);
         aio_enable_external(aio_context);
         bdrv_parent_drained_end(bs);
         bdrv_drain_recurse(bs, false);
         aio_context_release(aio_context);
+
+        bdrv_unref(bs);
     }
 
+    g_slist_free(bs_list);
+
     block_job_resume_all();
 }