diff mbox

[01/18] iothread: release iothread around aio_poll

Message ID 1438868176-20364-2-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Aug. 6, 2015, 1:35 p.m. UTC
This is the first step towards having fine-grained critical sections in
dataplane threads, which resolves lock ordering problems between
address_space_* functions (which need the BQL when doing MMIO, even
after we complete RCU-based dispatch) and the AioContext.

Because AioContext does not use contention callbacks anymore, the
unit test has to be changed.

Previously applied as a0710f7995f914e3044e5899bd8ff6c43c62f916 and
then reverted.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 async.c                     | 22 +++-------------------
 docs/multiple-iothreads.txt | 25 +++++++++++++++----------
 include/block/aio.h         |  3 ---
 iothread.c                  | 11 ++---------
 tests/test-aio.c            | 19 +++++++++++--------
 5 files changed, 31 insertions(+), 49 deletions(-)

Comments

Fam Zheng Sept. 9, 2015, 6:06 a.m. UTC | #1
On Thu, 08/06 15:35, Paolo Bonzini wrote:
> This is the first step towards having fine-grained critical sections in
> dataplane threads, which resolves lock ordering problems between
> address_space_* functions (which need the BQL when doing MMIO, even
> after we complete RCU-based dispatch) and the AioContext.
> 
> Because AioContext does not use contention callbacks anymore, the
> unit test has to be changed.
> 
> Previously applied as a0710f7995f914e3044e5899bd8ff6c43c62f916 and
> then reverted.
> 
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  async.c                     | 22 +++-------------------
>  docs/multiple-iothreads.txt | 25 +++++++++++++++----------
>  include/block/aio.h         |  3 ---
>  iothread.c                  | 11 ++---------
>  tests/test-aio.c            | 19 +++++++++++--------
>  5 files changed, 31 insertions(+), 49 deletions(-)
> 
> diff --git a/async.c b/async.c
> index efce14b..f992914 100644
> --- a/async.c
> +++ b/async.c
> @@ -79,8 +79,8 @@ int aio_bh_poll(AioContext *ctx)
>           * aio_notify again if necessary.
>           */
>          if (!bh->deleted && atomic_xchg(&bh->scheduled, 0)) {
> -            /* Idle BHs and the notify BH don't count as progress */
> -            if (!bh->idle && bh != ctx->notify_dummy_bh) {
> +            /* Idle BHs don't count as progress */
> +            if (!bh->idle) {
>                  ret = 1;
>              }
>              bh->idle = 0;
> @@ -232,7 +232,6 @@ aio_ctx_finalize(GSource     *source)
>  {
>      AioContext *ctx = (AioContext *) source;
>  
> -    qemu_bh_delete(ctx->notify_dummy_bh);
>      thread_pool_free(ctx->thread_pool);
>  
>      qemu_mutex_lock(&ctx->bh_lock);
> @@ -299,19 +298,6 @@ static void aio_timerlist_notify(void *opaque)
>      aio_notify(opaque);
>  }
>  
> -static void aio_rfifolock_cb(void *opaque)
> -{
> -    AioContext *ctx = opaque;
> -
> -    /* Kick owner thread in case they are blocked in aio_poll() */
> -    qemu_bh_schedule(ctx->notify_dummy_bh);
> -}
> -
> -static void notify_dummy_bh(void *opaque)
> -{
> -    /* Do nothing, we were invoked just to force the event loop to iterate */
> -}
> -
>  static void event_notifier_dummy_cb(EventNotifier *e)
>  {
>  }
> @@ -333,11 +319,9 @@ AioContext *aio_context_new(Error **errp)
>                             event_notifier_dummy_cb);
>      ctx->thread_pool = NULL;
>      qemu_mutex_init(&ctx->bh_lock);
> -    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
> +    rfifolock_init(&ctx->lock, NULL, NULL);
>      timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
>  
> -    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
> -
>      return ctx;
>  }
>  
> diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt
> index 40b8419..723cc7e 100644
> --- a/docs/multiple-iothreads.txt
> +++ b/docs/multiple-iothreads.txt
> @@ -105,13 +105,10 @@ a BH in the target AioContext beforehand and then call qemu_bh_schedule().  No
>  acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
>  sure to acquire the AioContext for aio_bh_new() if necessary.
>  
> -The relationship between AioContext and the block layer
> --------------------------------------------------------
> -The AioContext originates from the QEMU block layer because it provides a
> -scoped way of running event loop iterations until all work is done.  This
> -feature is used to complete all in-flight block I/O requests (see
> -bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
> -used by any QEMU subsystem.
> +AioContext and the block layer
> +------------------------------
> +The AioContext originates from the QEMU block layer, even though nowadays
> +AioContext is a generic event loop that can be used by any QEMU subsystem.
>  
>  The block layer has support for AioContext integrated.  Each BlockDriverState
>  is associated with an AioContext using bdrv_set_aio_context() and
> @@ -122,9 +119,17 @@ Block layer code must therefore expect to run in an IOThread and avoid using
>  old APIs that implicitly use the main loop.  See the "How to program for
>  IOThreads" above for information on how to do that.
>  
> -If main loop code such as a QMP function wishes to access a BlockDriverState it
> -must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
> -IOThread does not run in parallel.
> +If main loop code such as a QMP function wishes to access a BlockDriverState
> +it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
> +that callbacks in the IOThread do not run in parallel.
> +
> +Code running in the monitor typically uses bdrv_drain() to ensure that
> +past requests from the guest are completed.  When a block device is
> +running in an IOThread, the IOThread can also process requests from
> +the guest (via ioeventfd).  These requests have to be blocked with
> +aio_disable_clients() before calling bdrv_drain().  You can then reenable
> +guest requests with aio_enable_clients() before finally releasing the
> +AioContext and completing the monitor command.

This patch will probably go in before aio_disable_clients, if any, but I'm not
quite confident about the interface yet: listing a precise set of clients from
monitor is an ugly coupling between modules:

    aio_disable_clients(bdrv_get_aio_context(bs),
                        AIO_CLIENT_NBD | AIO_CLIENT_IOEVENTFD | AIO_CLIENT_FOO);

    ...

    aio_enable_clients(bdrv_get_aio_context(bs),
                       AIO_CLIENT_NBD | AIO_CLIENT_IOEVENTFD | AIO_CLIENT_FOO);

I prefer at least an abstraction:

    bdrv_quiesce_begin(bs);

    ...

    bdrv_quiesce_end(bs);

My point is maybe we should leave documenting this to whichever series that
introduces it?

Fam

>  
>  Long-running jobs (usually in the form of coroutines) are best scheduled in the
>  BlockDriverState's AioContext to avoid the need to acquire/release around each
> diff --git a/include/block/aio.h b/include/block/aio.h
> index 400b1b0..9dd32e0 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -114,9 +114,6 @@ struct AioContext {
>      bool notified;
>      EventNotifier notifier;
>  
> -    /* Scheduling this BH forces the event loop it iterate */
> -    QEMUBH *notify_dummy_bh;
> -
>      /* Thread pool for performing work and receiving completion callbacks */
>      struct ThreadPool *thread_pool;
>  
> diff --git a/iothread.c b/iothread.c
> index da6ce7b..8f6d2e4 100644
> --- a/iothread.c
> +++ b/iothread.c
> @@ -30,7 +30,6 @@ typedef ObjectClass IOThreadClass;
>  static void *iothread_run(void *opaque)
>  {
>      IOThread *iothread = opaque;
> -    bool blocking;
>  
>      rcu_register_thread();
>  
> @@ -39,14 +38,8 @@ static void *iothread_run(void *opaque)
>      qemu_cond_signal(&iothread->init_done_cond);
>      qemu_mutex_unlock(&iothread->init_done_lock);
>  
> -    while (!iothread->stopping) {
> -        aio_context_acquire(iothread->ctx);
> -        blocking = true;
> -        while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) {
> -            /* Progress was made, keep going */
> -            blocking = false;
> -        }
> -        aio_context_release(iothread->ctx);
> +    while (!atomic_read(&iothread->stopping)) {
> +        aio_poll(iothread->ctx, true);
>      }
>  
>      rcu_unregister_thread();
> diff --git a/tests/test-aio.c b/tests/test-aio.c
> index 217e337..b4bd3f1 100644
> --- a/tests/test-aio.c
> +++ b/tests/test-aio.c
> @@ -99,6 +99,7 @@ static void event_ready_cb(EventNotifier *e)
>  
>  typedef struct {
>      QemuMutex start_lock;
> +    EventNotifier notifier;
>      bool thread_acquired;
>  } AcquireTestData;
>  
> @@ -110,6 +111,8 @@ static void *test_acquire_thread(void *opaque)
>      qemu_mutex_lock(&data->start_lock);
>      qemu_mutex_unlock(&data->start_lock);
>  
> +    g_usleep(500000);
> +    event_notifier_set(&data->notifier);
>      aio_context_acquire(ctx);
>      aio_context_release(ctx);
>  
> @@ -118,20 +121,19 @@ static void *test_acquire_thread(void *opaque)
>      return NULL;
>  }
>  
> -static void dummy_notifier_read(EventNotifier *unused)
> +static void dummy_notifier_read(EventNotifier *n)
>  {
> -    g_assert(false); /* should never be invoked */
> +    event_notifier_test_and_clear(n);
>  }
>  
>  static void test_acquire(void)
>  {
>      QemuThread thread;
> -    EventNotifier notifier;
>      AcquireTestData data;
>  
>      /* Dummy event notifier ensures aio_poll() will block */
> -    event_notifier_init(&notifier, false);
> -    aio_set_event_notifier(ctx, &notifier, dummy_notifier_read);
> +    event_notifier_init(&data.notifier, false);
> +    aio_set_event_notifier(ctx, &data.notifier, dummy_notifier_read);
>      g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */
>  
>      qemu_mutex_init(&data.start_lock);
> @@ -145,12 +147,13 @@ static void test_acquire(void)
>      /* Block in aio_poll(), let other thread kick us and acquire context */
>      aio_context_acquire(ctx);
>      qemu_mutex_unlock(&data.start_lock); /* let the thread run */
> -    g_assert(!aio_poll(ctx, true));
> +    g_assert(aio_poll(ctx, true));
> +    g_assert(!data.thread_acquired);
>      aio_context_release(ctx);
>  
>      qemu_thread_join(&thread);
> -    aio_set_event_notifier(ctx, &notifier, NULL);
> -    event_notifier_cleanup(&notifier);
> +    aio_set_event_notifier(ctx, &data.notifier, NULL);
> +    event_notifier_cleanup(&data.notifier);
>  
>      g_assert(data.thread_acquired);
>  }
> -- 
> 2.4.3
> 
>
Paolo Bonzini Sept. 9, 2015, 7:31 a.m. UTC | #2
On 09/09/2015 08:06, Fam Zheng wrote:
> This patch will probably go in before aio_disable_clients,

Actually I think it's blocked by aio_disable_clients.

> if any, but I'm not
> quite confident about the interface yet: listing a precise set of clients from
> monitor is an ugly coupling between modules:
> 
>     aio_disable_clients(bdrv_get_aio_context(bs),
>                         AIO_CLIENT_NBD | AIO_CLIENT_IOEVENTFD | AIO_CLIENT_FOO);
> 
>     ...
> 
>     aio_enable_clients(bdrv_get_aio_context(bs),
>                        AIO_CLIENT_NBD | AIO_CLIENT_IOEVENTFD | AIO_CLIENT_FOO);
> 
> I prefer at least an abstraction:
> 
>     bdrv_quiesce_begin(bs);
> 
>     ...
> 
>     bdrv_quiesce_end(bs);

That's fine.

Paolo

> My point is maybe we should leave documenting this to whichever series that
> introduces it?
Stefan Hajnoczi Sept. 28, 2015, 9:50 a.m. UTC | #3
On Thu, Aug 06, 2015 at 03:35:59PM +0200, Paolo Bonzini wrote:
> This is the first step towards having fine-grained critical sections in
> dataplane threads, which resolves lock ordering problems between
> address_space_* functions (which need the BQL when doing MMIO, even
> after we complete RCU-based dispatch) and the AioContext.
> 
> Because AioContext does not use contention callbacks anymore, the
> unit test has to be changed.
> 
> Previously applied as a0710f7995f914e3044e5899bd8ff6c43c62f916 and
> then reverted.

commit da5e1de95bb235330d7724316e7a29239d1359d5
Author: Stefan Hajnoczi <stefanha@redhat.com>
Date:   Wed Jun 3 10:15:33 2015 +0100

    Revert "iothread: release iothread around aio_poll"
    
    This reverts commit a0710f7995f914e3044e5899bd8ff6c43c62f916.
    
    In qemu-devel email message <556DBF87.2020908@de.ibm.com>, Christian
    Borntraeger writes:
    
      Having many guests all with a kernel/ramdisk (via -kernel) and
      several null block devices will result in hangs. All hanging
      guests are in partition detection code waiting for an I/O to return
      so very early maybe even the first I/O.
    
      Reverting that commit "fixes" the hangs.
    
    Reverting this commit for the 2.4 release.  More time is needed to
    investigate and correct this patch.

Did we ever find the root cause for hangs caused by this patch?
Paolo Bonzini Sept. 28, 2015, 10:14 a.m. UTC | #4
On 28/09/2015 11:50, Stefan Hajnoczi wrote:
> On Thu, Aug 06, 2015 at 03:35:59PM +0200, Paolo Bonzini wrote:
>> This is the first step towards having fine-grained critical sections in
>> dataplane threads, which resolves lock ordering problems between
>> address_space_* functions (which need the BQL when doing MMIO, even
>> after we complete RCU-based dispatch) and the AioContext.
>>
>> Because AioContext does not use contention callbacks anymore, the
>> unit test has to be changed.
>>
>> Previously applied as a0710f7995f914e3044e5899bd8ff6c43c62f916 and
>> then reverted.
> 
> commit da5e1de95bb235330d7724316e7a29239d1359d5
> Author: Stefan Hajnoczi <stefanha@redhat.com>
> Date:   Wed Jun 3 10:15:33 2015 +0100
> 
>     Revert "iothread: release iothread around aio_poll"
>     
>     This reverts commit a0710f7995f914e3044e5899bd8ff6c43c62f916.
>     
>     In qemu-devel email message <556DBF87.2020908@de.ibm.com>, Christian
>     Borntraeger writes:
>     
>       Having many guests all with a kernel/ramdisk (via -kernel) and
>       several null block devices will result in hangs. All hanging
>       guests are in partition detection code waiting for an I/O to return
>       so very early maybe even the first I/O.
>     
>       Reverting that commit "fixes" the hangs.
>     
>     Reverting this commit for the 2.4 release.  More time is needed to
>     investigate and correct this patch.
> 
> Did we ever find the root cause for hangs caused by this patch?

It was fixed by commit 53ec73e ("block: Use bdrv_drain to replace
uncessary bdrv_drain_all", 2015-05-29)'s change to bdrv_set_aio_context.
 We never investigated the root cause, but I'd guess it's gone after the
2.4-rc bugfixes to AioContext.

Paolo
diff mbox

Patch

diff --git a/async.c b/async.c
index efce14b..f992914 100644
--- a/async.c
+++ b/async.c
@@ -79,8 +79,8 @@  int aio_bh_poll(AioContext *ctx)
          * aio_notify again if necessary.
          */
         if (!bh->deleted && atomic_xchg(&bh->scheduled, 0)) {
-            /* Idle BHs and the notify BH don't count as progress */
-            if (!bh->idle && bh != ctx->notify_dummy_bh) {
+            /* Idle BHs don't count as progress */
+            if (!bh->idle) {
                 ret = 1;
             }
             bh->idle = 0;
@@ -232,7 +232,6 @@  aio_ctx_finalize(GSource     *source)
 {
     AioContext *ctx = (AioContext *) source;
 
-    qemu_bh_delete(ctx->notify_dummy_bh);
     thread_pool_free(ctx->thread_pool);
 
     qemu_mutex_lock(&ctx->bh_lock);
@@ -299,19 +298,6 @@  static void aio_timerlist_notify(void *opaque)
     aio_notify(opaque);
 }
 
-static void aio_rfifolock_cb(void *opaque)
-{
-    AioContext *ctx = opaque;
-
-    /* Kick owner thread in case they are blocked in aio_poll() */
-    qemu_bh_schedule(ctx->notify_dummy_bh);
-}
-
-static void notify_dummy_bh(void *opaque)
-{
-    /* Do nothing, we were invoked just to force the event loop to iterate */
-}
-
 static void event_notifier_dummy_cb(EventNotifier *e)
 {
 }
@@ -333,11 +319,9 @@  AioContext *aio_context_new(Error **errp)
                            event_notifier_dummy_cb);
     ctx->thread_pool = NULL;
     qemu_mutex_init(&ctx->bh_lock);
-    rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
+    rfifolock_init(&ctx->lock, NULL, NULL);
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 
-    ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
-
     return ctx;
 }
 
diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt
index 40b8419..723cc7e 100644
--- a/docs/multiple-iothreads.txt
+++ b/docs/multiple-iothreads.txt
@@ -105,13 +105,10 @@  a BH in the target AioContext beforehand and then call qemu_bh_schedule().  No
 acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
 sure to acquire the AioContext for aio_bh_new() if necessary.
 
-The relationship between AioContext and the block layer
--------------------------------------------------------
-The AioContext originates from the QEMU block layer because it provides a
-scoped way of running event loop iterations until all work is done.  This
-feature is used to complete all in-flight block I/O requests (see
-bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
-used by any QEMU subsystem.
+AioContext and the block layer
+------------------------------
+The AioContext originates from the QEMU block layer, even though nowadays
+AioContext is a generic event loop that can be used by any QEMU subsystem.
 
 The block layer has support for AioContext integrated.  Each BlockDriverState
 is associated with an AioContext using bdrv_set_aio_context() and
@@ -122,9 +119,17 @@  Block layer code must therefore expect to run in an IOThread and avoid using
 old APIs that implicitly use the main loop.  See the "How to program for
 IOThreads" above for information on how to do that.
 
-If main loop code such as a QMP function wishes to access a BlockDriverState it
-must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
-IOThread does not run in parallel.
+If main loop code such as a QMP function wishes to access a BlockDriverState
+it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
+that callbacks in the IOThread do not run in parallel.
+
+Code running in the monitor typically uses bdrv_drain() to ensure that
+past requests from the guest are completed.  When a block device is
+running in an IOThread, the IOThread can also process requests from
+the guest (via ioeventfd).  These requests have to be blocked with
+aio_disable_clients() before calling bdrv_drain().  You can then reenable
+guest requests with aio_enable_clients() before finally releasing the
+AioContext and completing the monitor command.
 
 Long-running jobs (usually in the form of coroutines) are best scheduled in the
 BlockDriverState's AioContext to avoid the need to acquire/release around each
diff --git a/include/block/aio.h b/include/block/aio.h
index 400b1b0..9dd32e0 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -114,9 +114,6 @@  struct AioContext {
     bool notified;
     EventNotifier notifier;
 
-    /* Scheduling this BH forces the event loop it iterate */
-    QEMUBH *notify_dummy_bh;
-
     /* Thread pool for performing work and receiving completion callbacks */
     struct ThreadPool *thread_pool;
 
diff --git a/iothread.c b/iothread.c
index da6ce7b..8f6d2e4 100644
--- a/iothread.c
+++ b/iothread.c
@@ -30,7 +30,6 @@  typedef ObjectClass IOThreadClass;
 static void *iothread_run(void *opaque)
 {
     IOThread *iothread = opaque;
-    bool blocking;
 
     rcu_register_thread();
 
@@ -39,14 +38,8 @@  static void *iothread_run(void *opaque)
     qemu_cond_signal(&iothread->init_done_cond);
     qemu_mutex_unlock(&iothread->init_done_lock);
 
-    while (!iothread->stopping) {
-        aio_context_acquire(iothread->ctx);
-        blocking = true;
-        while (!iothread->stopping && aio_poll(iothread->ctx, blocking)) {
-            /* Progress was made, keep going */
-            blocking = false;
-        }
-        aio_context_release(iothread->ctx);
+    while (!atomic_read(&iothread->stopping)) {
+        aio_poll(iothread->ctx, true);
     }
 
     rcu_unregister_thread();
diff --git a/tests/test-aio.c b/tests/test-aio.c
index 217e337..b4bd3f1 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -99,6 +99,7 @@  static void event_ready_cb(EventNotifier *e)
 
 typedef struct {
     QemuMutex start_lock;
+    EventNotifier notifier;
     bool thread_acquired;
 } AcquireTestData;
 
@@ -110,6 +111,8 @@  static void *test_acquire_thread(void *opaque)
     qemu_mutex_lock(&data->start_lock);
     qemu_mutex_unlock(&data->start_lock);
 
+    g_usleep(500000);
+    event_notifier_set(&data->notifier);
     aio_context_acquire(ctx);
     aio_context_release(ctx);
 
@@ -118,20 +121,19 @@  static void *test_acquire_thread(void *opaque)
     return NULL;
 }
 
-static void dummy_notifier_read(EventNotifier *unused)
+static void dummy_notifier_read(EventNotifier *n)
 {
-    g_assert(false); /* should never be invoked */
+    event_notifier_test_and_clear(n);
 }
 
 static void test_acquire(void)
 {
     QemuThread thread;
-    EventNotifier notifier;
     AcquireTestData data;
 
     /* Dummy event notifier ensures aio_poll() will block */
-    event_notifier_init(&notifier, false);
-    aio_set_event_notifier(ctx, &notifier, dummy_notifier_read);
+    event_notifier_init(&data.notifier, false);
+    aio_set_event_notifier(ctx, &data.notifier, dummy_notifier_read);
     g_assert(!aio_poll(ctx, false)); /* consume aio_notify() */
 
     qemu_mutex_init(&data.start_lock);
@@ -145,12 +147,13 @@  static void test_acquire(void)
     /* Block in aio_poll(), let other thread kick us and acquire context */
     aio_context_acquire(ctx);
     qemu_mutex_unlock(&data.start_lock); /* let the thread run */
-    g_assert(!aio_poll(ctx, true));
+    g_assert(aio_poll(ctx, true));
+    g_assert(!data.thread_acquired);
     aio_context_release(ctx);
 
     qemu_thread_join(&thread);
-    aio_set_event_notifier(ctx, &notifier, NULL);
-    event_notifier_cleanup(&notifier);
+    aio_set_event_notifier(ctx, &data.notifier, NULL);
+    event_notifier_cleanup(&data.notifier);
 
     g_assert(data.thread_acquired);
 }