[15/18] block: only call aio_poll on the current thread's AioContext

Message ID 1476380062-18001-16-git-send-email-pbonzini@redhat.com
State New

Commit Message

Paolo Bonzini Oct. 13, 2016, 5:34 p.m. UTC
aio_poll is not thread safe; for example bdrv_drain can hang if
the last in-flight I/O operation is completed in the I/O thread after
the main thread has checked bs->in_flight.

The bug remains latent as long as aio_poll is always called within
aio_context_acquire/aio_context_release, but this will change soon.

To fix this, if bdrv_drain is called from outside the I/O thread,
signal the main AioContext through a dummy bottom half.  The event
loop then only runs in the I/O thread.
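
As an illustration (not part of the patch), the hang corresponds to an
interleaving like the following:

    main thread                          I/O thread
    -----------                          ----------
    bdrv_drain(bs):
      sees bs->in_flight != 0
                                         last request completes,
                                         bs->in_flight drops to 0
      aio_poll(bs's AioContext, true)
        -> blocks forever, no event left to wake it up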

Reviewed-by: Fam Zheng <famz@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 async.c                   |  1 +
 block.c                   |  2 ++
 block/io.c                |  7 +++++++
 include/block/block.h     | 24 +++++++++++++++++++++---
 include/block/block_int.h |  1 +
 5 files changed, 32 insertions(+), 3 deletions(-)

Comments

Fam Zheng Oct. 14, 2016, 2:55 p.m. UTC | #1
On Thu, 10/13 19:34, Paolo Bonzini wrote:
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 11f877b..0516f62 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -470,6 +470,7 @@ struct BlockDriverState {
>      NotifierWithReturnList before_write_notifiers;
>  
>      /* number of in-flight requests; overall and serialising */
> +    bool wakeup;

This should probably be moved above the in-flight comment.
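
I.e. something like this (untested sketch):

    bool wakeup;

    /* number of in-flight requests; overall and serialising */
    unsigned int in_flight;
    unsigned int serialising_in_flight;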

Fam

>      unsigned int in_flight;
>      unsigned int serialising_in_flight;
>  
> -- 
> 2.7.4
> 
>
Stefan Hajnoczi Oct. 16, 2016, 4:40 p.m. UTC | #2
On Thu, Oct 13, 2016 at 07:34:19PM +0200, Paolo Bonzini wrote:
> aio_poll is not thread safe; for example bdrv_drain can hang if
> the last in-flight I/O operation is completed in the I/O thread after
> the main thread has checked bs->in_flight.
> 
> The bug remains latent as long as aio_poll is always called within
> aio_context_acquire/aio_context_release, but this will change soon.
> 
> To fix this, if bdrv_drain is called from outside the I/O thread,
> signal the main AioContext through a dummy bottom half.  The event
> loop then only runs in the I/O thread.
> 
> Reviewed-by: Fam Zheng <famz@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  async.c                   |  1 +
>  block.c                   |  2 ++
>  block/io.c                |  7 +++++++
>  include/block/block.h     | 24 +++++++++++++++++++++---
>  include/block/block_int.h |  1 +
>  5 files changed, 32 insertions(+), 3 deletions(-)
> 
> diff --git a/async.c b/async.c
> index f30d011..fb37b03 100644
> --- a/async.c
> +++ b/async.c
> @@ -61,6 +61,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
>      smp_wmb();
>      ctx->first_bh = bh;
>      qemu_mutex_unlock(&ctx->bh_lock);
> +    aio_notify(ctx);
>  }
>  
>  QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
> diff --git a/block.c b/block.c
> index fbe485c..a17baab 100644
> --- a/block.c
> +++ b/block.c
> @@ -2090,7 +2090,9 @@ int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
>  
>      assert(bs_queue != NULL);
>  
> +    aio_context_release(ctx);
>      bdrv_drain_all();
> +    aio_context_acquire(ctx);
>  
>      QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
>          if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
> diff --git a/block/io.c b/block/io.c
> index 7d3dcfc..cd28909 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -474,8 +474,15 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
>      atomic_inc(&bs->in_flight);
>  }
>  
> +static void dummy_bh_cb(void *opaque)
> +{
> +}
> +
>  void bdrv_wakeup(BlockDriverState *bs)
>  {
> +    if (bs->wakeup) {
> +        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
> +    }
>  }

Why use a dummy BH instead of aio_notify()?

>  
>  void bdrv_dec_in_flight(BlockDriverState *bs)
> diff --git a/include/block/block.h b/include/block/block.h
> index ba4318b..72d5d8e 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -343,9 +343,27 @@ void bdrv_drain_all(void);
>  #define bdrv_poll_while(bs, cond) ({                       \
>      bool waited_ = false;                                  \
>      BlockDriverState *bs_ = (bs);                          \
> -    while ((cond)) {                                       \
> -        aio_poll(bdrv_get_aio_context(bs_), true);         \
> -        waited_ = true;                                    \
> +    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
> +    if (aio_context_in_iothread(ctx_)) {                   \
> +        while ((cond)) {                                   \
> +            aio_poll(ctx_, true);                          \
> +            waited_ = true;                                \
> +        }                                                  \
> +    } else {                                               \
> +        assert(qemu_get_current_aio_context() ==           \
> +               qemu_get_aio_context());                    \

The assumption is that IOThread #1 will never call bdrv_poll_while() on
IOThread #2's AioContext.  I believe that is true today.  Is this what
you had in mind?

Please add a comment since it's not completely obvious from the assert
expression.
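
Something along these lines, perhaps (just a sketch):

    /* We must be in the main loop: an IOThread never calls
     * bdrv_poll_while() on another IOThread's AioContext.
     */
    assert(qemu_get_current_aio_context() ==
           qemu_get_aio_context());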

> +        /* Ask bdrv_dec_in_flight to wake up the main      \
> +         * QEMU AioContext.                                \
> +         */                                                \
> +        assert(!bs_->wakeup);                              \
> +        bs_->wakeup = true;                                \
> +        while ((cond)) {                                   \
> +            aio_context_release(ctx_);                     \
> +            aio_poll(qemu_get_aio_context(), true);        \
> +            aio_context_acquire(ctx_);                     \
> +            waited_ = true;                                \
> +        }                                                  \
> +        bs_->wakeup = false;                               \
>      }                                                      \
>      waited_; })
>  
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 11f877b..0516f62 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -470,6 +470,7 @@ struct BlockDriverState {
>      NotifierWithReturnList before_write_notifiers;
>  
>      /* number of in-flight requests; overall and serialising */
> +    bool wakeup;
>      unsigned int in_flight;
>      unsigned int serialising_in_flight;
>  
> -- 
> 2.7.4
> 
> 
>
Paolo Bonzini Oct. 17, 2016, 8:04 a.m. UTC | #3
On 16/10/2016 18:40, Stefan Hajnoczi wrote:
> >  void bdrv_wakeup(BlockDriverState *bs)
> >  {
> > +    if (bs->wakeup) {
> > +        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
> > +    }
> >  }
> 
> Why use a dummy BH instead of aio_notify()?

Originally I used aio_bh_schedule_oneshot() because aio_notify() is not
enough to make aio_poll() return true.  That said, I am no longer using
the result of aio_poll anyway.

Since this is not a fast path and it is not stressed much by
qemu-iotests, I think it's better if we can move towards making
aio_notify() more or less an internal detail.  If you prefer
aio_notify(), however, I can look into that as well.
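
If we go that way, I suppose bdrv_wakeup() would become something like
this (untested):

    void bdrv_wakeup(BlockDriverState *bs)
    {
        if (bs->wakeup) {
            aio_notify(qemu_get_aio_context());
        }
    }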

Thanks,

Paolo

>> >  void bdrv_dec_in_flight(BlockDriverState *bs)
>> > diff --git a/include/block/block.h b/include/block/block.h
>> > index ba4318b..72d5d8e 100644
>> > --- a/include/block/block.h
>> > +++ b/include/block/block.h
>> > @@ -343,9 +343,27 @@ void bdrv_drain_all(void);
>> >  #define bdrv_poll_while(bs, cond) ({                       \
>> >      bool waited_ = false;                                  \
>> >      BlockDriverState *bs_ = (bs);                          \
>> > -    while ((cond)) {                                       \
>> > -        aio_poll(bdrv_get_aio_context(bs_), true);         \
>> > -        waited_ = true;                                    \
>> > +    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
>> > +    if (aio_context_in_iothread(ctx_)) {                   \
>> > +        while ((cond)) {                                   \
>> > +            aio_poll(ctx_, true);                          \
>> > +            waited_ = true;                                \
>> > +        }                                                  \
>> > +    } else {                                               \
>> > +        assert(qemu_get_current_aio_context() ==           \
>> > +               qemu_get_aio_context());                    \
> The assumption is that IOThread #1 will never call bdrv_poll_while() on
> IOThread #2's AioContext.  I believe that is true today.  Is this what
> you had in mind?
> 
> Please add a comment since it's not completely obvious from the assert
> expression.
>
Stefan Hajnoczi Oct. 18, 2016, 10:10 a.m. UTC | #4
On Mon, Oct 17, 2016 at 10:04:59AM +0200, Paolo Bonzini wrote:
> 
> 
> On 16/10/2016 18:40, Stefan Hajnoczi wrote:
> > >  void bdrv_wakeup(BlockDriverState *bs)
> > >  {
> > > +    if (bs->wakeup) {
> > > +        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
> > > +    }
> > >  }
> > 
> > Why use a dummy BH instead of aio_notify()?
> 
> Originally I used aio_bh_schedule_oneshot() because aio_notify() is not
> enough to make aio_poll() return true.  That said, I am no longer using
> the result of aio_poll anyway.
> 
> Since this is not a fast path and it is not stressed much by
> qemu-iotests, I think it's better if we can move towards making
> aio_notify() more or less an internal detail.  If you prefer
> aio_notify(), however, I can look into that as well.

I was just wondering if there is a reason that I missed.

Stefan

Patch

diff --git a/async.c b/async.c
index f30d011..fb37b03 100644
--- a/async.c
+++ b/async.c
@@ -61,6 +61,7 @@  void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
     smp_wmb();
     ctx->first_bh = bh;
     qemu_mutex_unlock(&ctx->bh_lock);
+    aio_notify(ctx);
 }
 
 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
diff --git a/block.c b/block.c
index fbe485c..a17baab 100644
--- a/block.c
+++ b/block.c
@@ -2090,7 +2090,9 @@  int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **er
 
     assert(bs_queue != NULL);
 
+    aio_context_release(ctx);
     bdrv_drain_all();
+    aio_context_acquire(ctx);
 
     QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
         if (bdrv_reopen_prepare(&bs_entry->state, bs_queue, &local_err)) {
diff --git a/block/io.c b/block/io.c
index 7d3dcfc..cd28909 100644
--- a/block/io.c
+++ b/block/io.c
@@ -474,8 +474,15 @@  void bdrv_inc_in_flight(BlockDriverState *bs)
     atomic_inc(&bs->in_flight);
 }
 
+static void dummy_bh_cb(void *opaque)
+{
+}
+
 void bdrv_wakeup(BlockDriverState *bs)
 {
+    if (bs->wakeup) {
+        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+    }
 }
 
 void bdrv_dec_in_flight(BlockDriverState *bs)
diff --git a/include/block/block.h b/include/block/block.h
index ba4318b..72d5d8e 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -343,9 +343,27 @@  void bdrv_drain_all(void);
 #define bdrv_poll_while(bs, cond) ({                       \
     bool waited_ = false;                                  \
     BlockDriverState *bs_ = (bs);                          \
-    while ((cond)) {                                       \
-        aio_poll(bdrv_get_aio_context(bs_), true);         \
-        waited_ = true;                                    \
+    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
+    if (aio_context_in_iothread(ctx_)) {                   \
+        while ((cond)) {                                   \
+            aio_poll(ctx_, true);                          \
+            waited_ = true;                                \
+        }                                                  \
+    } else {                                               \
+        assert(qemu_get_current_aio_context() ==           \
+               qemu_get_aio_context());                    \
+        /* Ask bdrv_dec_in_flight to wake up the main      \
+         * QEMU AioContext.                                \
+         */                                                \
+        assert(!bs_->wakeup);                              \
+        bs_->wakeup = true;                                \
+        while ((cond)) {                                   \
+            aio_context_release(ctx_);                     \
+            aio_poll(qemu_get_aio_context(), true);        \
+            aio_context_acquire(ctx_);                     \
+            waited_ = true;                                \
+        }                                                  \
+        bs_->wakeup = false;                               \
     }                                                      \
     waited_; })
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 11f877b..0516f62 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -470,6 +470,7 @@  struct BlockDriverState {
     NotifierWithReturnList before_write_notifiers;
 
     /* number of in-flight requests; overall and serialising */
+    bool wakeup;
     unsigned int in_flight;
     unsigned int serialising_in_flight;