diff mbox

posix-aio-compat: fix latency issues

Message ID 1312803458-2272-1-git-send-email-avi@redhat.com
State New
Headers show

Commit Message

Avi Kivity Aug. 8, 2011, 11:37 a.m. UTC
In certain circumstances, posix-aio-compat can incur a lot of latency:
 - threads are created by vcpu threads, so if vcpu affinity is set,
   aio threads inherit vcpu affinity.  This can cause many aio threads
   to compete for one cpu.
 - we can create up to max_threads (64) aio threads in one go; since a
   pthread_create can take around 30μs, we have up to 2ms of cpu time
   under a global lock.

Fix by:
 - moving thread creation to the main thread, so we inherit the main
   thread's affinity instead of the vcpu thread's affinity.
 - if a thread is currently being created, and we need to create yet
   another thread, let the thread being born create the new thread, reducing
   the amount of time we spend under the main thread.
 - drop the local lock while creating a thread (we may still hold the
   global mutex, though)

Note this doesn't eliminate latency completely; scheduler artifacts or
lack of host cpu resources can still cause it.  We may want pre-allocated
threads when this cannot be tolerated.

Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 posix-aio-compat.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 46 insertions(+), 2 deletions(-)

Comments

Anthony Liguori Aug. 8, 2011, 12:34 p.m. UTC | #1
On 08/08/2011 06:37 AM, Avi Kivity wrote:
> In certain circumstances, posix-aio-compat can incur a lot of latency:
>   - threads are created by vcpu threads, so if vcpu affinity is set,
>     aio threads inherit vcpu affinity.  This can cause many aio threads
>     to compete for one cpu.
>   - we can create up to max_threads (64) aio threads in one go; since a
>     pthread_create can take around 30μs, we have up to 2ms of cpu time
>     under a global lock.
>
> Fix by:
>   - moving thread creation to the main thread, so we inherit the main
>     thread's affinity instead of the vcpu thread's affinity.
>   - if a thread is currently being created, and we need to create yet
>     another thread, let thread being born create the new thread, reducing
>     the amount of time we spend under the main thread.
>   - drop the local lock while creating a thread (we may still hold the
>     global mutex, though)
>
> Note this doesn't eliminate latency completely; scheduler artifacts or
> lack of host cpu resources can still cause it.  We may want pre-allocated
> threads when this cannot be tolerated.
>
> Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions.

Do you have a scenario where you can measure the benefits of this 
change?  The idle time in the thread pool is rather large; it surprises
me that it'd be an issue in practice.

Regards,

Anthony Liguori

>
> Signed-off-by: Avi Kivity<avi@redhat.com>
> ---
>   posix-aio-compat.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++--
>   1 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/posix-aio-compat.c b/posix-aio-compat.c
> index 8dc00cb..aa30673 100644
> --- a/posix-aio-compat.c
> +++ b/posix-aio-compat.c
> @@ -30,6 +30,7 @@
>
>   #include "block/raw-posix-aio.h"
>
> +static void do_spawn_thread(void);
>
>   struct qemu_paiocb {
>       BlockDriverAIOCB common;
> @@ -64,6 +65,9 @@ static pthread_attr_t attr;
>   static int max_threads = 64;
>   static int cur_threads = 0;
>   static int idle_threads = 0;
> +static int new_threads = 0;     /* backlog of threads we need to create */
> +static int pending_threads = 0; /* threads created but not running yet */
> +static QEMUBH *new_thread_bh;
>   static QTAILQ_HEAD(, qemu_paiocb) request_list;
>
>   #ifdef CONFIG_PREADV
> @@ -311,6 +315,13 @@ static void *aio_thread(void *unused)
>
>       pid = getpid();
>
> +    mutex_lock(&lock);
> +    if (new_threads) {
> +        do_spawn_thread();
> +    }
> +    pending_threads--;
> +    mutex_unlock(&lock);
> +
>       while (1) {
>           struct qemu_paiocb *aiocb;
>           ssize_t ret = 0;
> @@ -381,11 +392,18 @@ static void *aio_thread(void *unused)
>       return NULL;
>   }
>
> -static void spawn_thread(void)
> +static void do_spawn_thread(void)
>   {
>       sigset_t set, oldset;
>
> -    cur_threads++;
> +    if (!new_threads) {
> +        return;
> +    }
> +
> +    new_threads--;
> +    pending_threads++;
> +
> +    mutex_unlock(&lock);
>
>       /* block all signals */
>       if (sigfillset(&set)) die("sigfillset");
> @@ -394,6 +412,31 @@ static void spawn_thread(void)
>       thread_create(&thread_id,&attr, aio_thread, NULL);
>
>       if (sigprocmask(SIG_SETMASK,&oldset, NULL)) die("sigprocmask restore");
> +
> +    mutex_lock(&lock);
> +}
> +
> +static void spawn_thread_bh_fn(void *opaque)
> +{
> +    mutex_lock(&lock);
> +    do_spawn_thread();
> +    mutex_unlock(&lock);
> +}
> +
> +static void spawn_thread(void)
> +{
> +    cur_threads++;
> +    new_threads++;
> +    /* If there are threads being created, they will spawn new workers, so
> +     * we don't spend time creating many threads in a loop holding a mutex or
> +     * starving the current vcpu.
> +     *
> +     * If there are no idle threads, ask the main thread to create one, so we
> +     * inherit the correct affinity instead of the vcpu affinity.
> +     */
> +    if (!pending_threads) {
> +        qemu_bh_schedule(new_thread_bh);
> +    }
>   }
>
>   static void qemu_paio_submit(struct qemu_paiocb *aiocb)
> @@ -665,6 +708,7 @@ int paio_init(void)
>           die2(ret, "pthread_attr_setdetachstate");
>
>       QTAILQ_INIT(&request_list);
> +    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
>
>       posix_aio_state = s;
>       return 0;
Avi Kivity Aug. 8, 2011, 12:42 p.m. UTC | #2
On 08/08/2011 03:34 PM, Anthony Liguori wrote:
> On 08/08/2011 06:37 AM, Avi Kivity wrote:
>> In certain circumstances, posix-aio-compat can incur a lot of latency:
>>   - threads are created by vcpu threads, so if vcpu affinity is set,
>>     aio threads inherit vcpu affinity.  This can cause many aio threads
>>     to compete for one cpu.
>>   - we can create up to max_threads (64) aio threads in one go; since a
>>     pthread_create can take around 30μs, we have up to 2ms of cpu time
>>     under a global lock.
>>
>> Fix by:
>>   - moving thread creation to the main thread, so we inherit the main
>>     thread's affinity instead of the vcpu thread's affinity.
>>   - if a thread is currently being created, and we need to create yet
>>     another thread, let thread being born create the new thread, 
>> reducing
>>     the amount of time we spend under the main thread.
>>   - drop the local lock while creating a thread (we may still hold the
>>     global mutex, though)
>>
>> Note this doesn't eliminate latency completely; scheduler artifacts or
>> lack of host cpu resources can still cause it.  We may want 
>> pre-allocated
>> threads when this cannot be tolerated.
>>
>> Thanks to Uli Obergfell of Red Hat for his excellent analysis and 
>> suggestions.
>
> Do you have a scenario where you can measure the benefits of this change? 

It's a customer scenario, so I can't share it.  Not that I know exactly 
what happened there in terms of workload.

> The idle time in the thread pool is rather large, it surprises me that 
> it'd be an issue in practice.
>

Just starting up a virtio guest will fill the queue with > max_threads 
requests, and if the vcpu is pinned, all 64 thread creations and 
executions will have to run on the same cpu, and will likely preempt the 
vcpu since it's classified as a "cpu hog" by some schedulers.
Frediano Ziglio Aug. 8, 2011, 12:49 p.m. UTC | #3
2011/8/8 Avi Kivity <avi@redhat.com>:
> In certain circumstances, posix-aio-compat can incur a lot of latency:
>  - threads are created by vcpu threads, so if vcpu affinity is set,
>   aio threads inherit vcpu affinity.  This can cause many aio threads
>   to compete for one cpu.
>  - we can create up to max_threads (64) aio threads in one go; since a
>   pthread_create can take around 30μs, we have up to 2ms of cpu time
>   under a global lock.
>
> Fix by:
>  - moving thread creation to the main thread, so we inherit the main
>   thread's affinity instead of the vcpu thread's affinity.
>  - if a thread is currently being created, and we need to create yet
>   another thread, let thread being born create the new thread, reducing
>   the amount of time we spend under the main thread.
>  - drop the local lock while creating a thread (we may still hold the
>   global mutex, though)
>
> Note this doesn't eliminate latency completely; scheduler artifacts or
> lack of host cpu resources can still cause it.  We may want pre-allocated
> threads when this cannot be tolerated.
>
> Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions.
>
> Signed-off-by: Avi Kivity <avi@redhat.com>

Why not call pthread_attr_setaffinity_np (where available) before
thread creation, or sched_setaffinity at thread start, instead of telling
another thread to create a thread for us just to get the affinity cleared?

Regards
  Frediano

> ---
>  posix-aio-compat.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/posix-aio-compat.c b/posix-aio-compat.c
> index 8dc00cb..aa30673 100644
> --- a/posix-aio-compat.c
> +++ b/posix-aio-compat.c
> @@ -30,6 +30,7 @@
>
>  #include "block/raw-posix-aio.h"
>
> +static void do_spawn_thread(void);
>
>  struct qemu_paiocb {
>     BlockDriverAIOCB common;
> @@ -64,6 +65,9 @@ static pthread_attr_t attr;
>  static int max_threads = 64;
>  static int cur_threads = 0;
>  static int idle_threads = 0;
> +static int new_threads = 0;     /* backlog of threads we need to create */
> +static int pending_threads = 0; /* threads created but not running yet */
> +static QEMUBH *new_thread_bh;
>  static QTAILQ_HEAD(, qemu_paiocb) request_list;
>
>  #ifdef CONFIG_PREADV
> @@ -311,6 +315,13 @@ static void *aio_thread(void *unused)
>
>     pid = getpid();
>
> +    mutex_lock(&lock);
> +    if (new_threads) {
> +        do_spawn_thread();
> +    }
> +    pending_threads--;
> +    mutex_unlock(&lock);
> +
>     while (1) {
>         struct qemu_paiocb *aiocb;
>         ssize_t ret = 0;
> @@ -381,11 +392,18 @@ static void *aio_thread(void *unused)
>     return NULL;
>  }
>
> -static void spawn_thread(void)
> +static void do_spawn_thread(void)
>  {
>     sigset_t set, oldset;
>
> -    cur_threads++;
> +    if (!new_threads) {
> +        return;
> +    }
> +
> +    new_threads--;
> +    pending_threads++;
> +
> +    mutex_unlock(&lock);
>
>     /* block all signals */
>     if (sigfillset(&set)) die("sigfillset");
> @@ -394,6 +412,31 @@ static void spawn_thread(void)
>     thread_create(&thread_id, &attr, aio_thread, NULL);
>
>     if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
> +
> +    mutex_lock(&lock);
> +}
> +
> +static void spawn_thread_bh_fn(void *opaque)
> +{
> +    mutex_lock(&lock);
> +    do_spawn_thread();
> +    mutex_unlock(&lock);
> +}
> +
> +static void spawn_thread(void)
> +{
> +    cur_threads++;
> +    new_threads++;
> +    /* If there are threads being created, they will spawn new workers, so
> +     * we don't spend time creating many threads in a loop holding a mutex or
> +     * starving the current vcpu.
> +     *
> +     * If there are no idle threads, ask the main thread to create one, so we
> +     * inherit the correct affinity instead of the vcpu affinity.
> +     */
> +    if (!pending_threads) {
> +        qemu_bh_schedule(new_thread_bh);
> +    }
>  }
>
>  static void qemu_paio_submit(struct qemu_paiocb *aiocb)
> @@ -665,6 +708,7 @@ int paio_init(void)
>         die2(ret, "pthread_attr_setdetachstate");
>
>     QTAILQ_INIT(&request_list);
> +    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
>
>     posix_aio_state = s;
>     return 0;
> --
> 1.7.5.3
>
>
>
Avi Kivity Aug. 8, 2011, 12:54 p.m. UTC | #4
On 08/08/2011 03:49 PM, Frediano Ziglio wrote:
> 2011/8/8 Avi Kivity<avi@redhat.com>:
> >  In certain circumstances, posix-aio-compat can incur a lot of latency:
> >    - threads are created by vcpu threads, so if vcpu affinity is set,
> >     aio threads inherit vcpu affinity.  This can cause many aio threads
> >     to compete for one cpu.
> >    - we can create up to max_threads (64) aio threads in one go; since a
> >     pthread_create can take around 30μs, we have up to 2ms of cpu time
> >     under a global lock.
> >
> >  Fix by:
> >    - moving thread creation to the main thread, so we inherit the main
> >     thread's affinity instead of the vcpu thread's affinity.
> >    - if a thread is currently being created, and we need to create yet
> >     another thread, let thread being born create the new thread, reducing
> >     the amount of time we spend under the main thread.
> >    - drop the local lock while creating a thread (we may still hold the
> >     global mutex, though)
> >
> >  Note this doesn't eliminate latency completely; scheduler artifacts or
> >  lack of host cpu resources can still cause it.  We may want pre-allocated
> >  threads when this cannot be tolerated.
> >
> >  Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions.
> >
> >  Signed-off-by: Avi Kivity<avi@redhat.com>
>
> Why not calling pthread_attr_setaffinity_np (where available) before
> thread creation or shed_setaffinity at thread start instead of telling
> another thread to create a thread for us just to get affinity cleared?
>

The entire qemu process may be affined to a subset of the host cpus; we 
don't want to break that.

For example:

    taskset 0xf0 qemu ....
    (qemu) info cpus
<pin individual vcpu threads to host cpus>
Frediano Ziglio Aug. 8, 2011, 1:21 p.m. UTC | #5
2011/8/8 Avi Kivity <avi@redhat.com>:
> On 08/08/2011 03:49 PM, Frediano Ziglio wrote:
>>
>> 2011/8/8 Avi Kivity<avi@redhat.com>:
>> >  In certain circumstances, posix-aio-compat can incur a lot of latency:
>> >    - threads are created by vcpu threads, so if vcpu affinity is set,
>> >     aio threads inherit vcpu affinity.  This can cause many aio threads
>> >     to compete for one cpu.
>> >    - we can create up to max_threads (64) aio threads in one go; since a
>> >     pthread_create can take around 30μs, we have up to 2ms of cpu time
>> >     under a global lock.
>> >
>> >  Fix by:
>> >    - moving thread creation to the main thread, so we inherit the main
>> >     thread's affinity instead of the vcpu thread's affinity.
>> >    - if a thread is currently being created, and we need to create yet
>> >     another thread, let thread being born create the new thread,
>> > reducing
>> >     the amount of time we spend under the main thread.
>> >    - drop the local lock while creating a thread (we may still hold the
>> >     global mutex, though)
>> >
>> >  Note this doesn't eliminate latency completely; scheduler artifacts or
>> >  lack of host cpu resources can still cause it.  We may want
>> > pre-allocated
>> >  threads when this cannot be tolerated.
>> >
>> >  Thanks to Uli Obergfell of Red Hat for his excellent analysis and
>> > suggestions.
>> >
>> >  Signed-off-by: Avi Kivity<avi@redhat.com>
>>
>> Why not calling pthread_attr_setaffinity_np (where available) before
>> thread creation or shed_setaffinity at thread start instead of telling
>> another thread to create a thread for us just to get affinity cleared?
>>
>
> The entire qemu process may be affined to a subset of the host cpus; we
> don't want to break that.
>
> For example:
>
>   taskset 0xf0 qemu ....
>   (qemu) info cpus
> <pin individual vcpu threads to host cpus>
>
>

Just call sched_getaffinity at program start, save to a global
variable and then set this affinity for io threads.
I haven't used affinity much, but from the manual it seems that if you
own the process you can set its affinity as you like.
IMHO this patch introduces a delay in io thread creation by posting
thread creation to another thread just to get a different affinity.

Frediano
Avi Kivity Aug. 8, 2011, 1:26 p.m. UTC | #6
On 08/08/2011 04:21 PM, Frediano Ziglio wrote:
> >
> >  The entire qemu process may be affined to a subset of the host cpus; we
> >  don't want to break that.
> >
> >  For example:
> >
> >     taskset 0xf0 qemu ....
> >     (qemu) info cpus
> >  <pin individual vcpu threads to host cpus>
> >
> >
>
> Just call sched_getaffinity at program start, save to a global
> variable and then set this affinity for io threads.

This affinity may change later on.

> I didn't use affinity that much but from manual it seems that if you
> own process you can set affinity as you like.
> IMHO this patch introduce a delay in io thread creation due to posting
> thread creation to another thread just to set different affinity.

It does.  But aio threads have a long life, so this happens very rarely.
Anthony Liguori Aug. 12, 2011, 1:24 p.m. UTC | #7
On 08/08/2011 06:37 AM, Avi Kivity wrote:
> In certain circumstances, posix-aio-compat can incur a lot of latency:
>   - threads are created by vcpu threads, so if vcpu affinity is set,
>     aio threads inherit vcpu affinity.  This can cause many aio threads
>     to compete for one cpu.
>   - we can create up to max_threads (64) aio threads in one go; since a
>     pthread_create can take around 30μs, we have up to 2ms of cpu time
>     under a global lock.
>
> Fix by:
>   - moving thread creation to the main thread, so we inherit the main
>     thread's affinity instead of the vcpu thread's affinity.
>   - if a thread is currently being created, and we need to create yet
>     another thread, let thread being born create the new thread, reducing
>     the amount of time we spend under the main thread.
>   - drop the local lock while creating a thread (we may still hold the
>     global mutex, though)
>
> Note this doesn't eliminate latency completely; scheduler artifacts or
> lack of host cpu resources can still cause it.  We may want pre-allocated
> threads when this cannot be tolerated.
>
> Thanks to Uli Obergfell of Red Hat for his excellent analysis and suggestions.
>
> Signed-off-by: Avi Kivity<avi@redhat.com>
> ---
>   posix-aio-compat.c |   48 ++++++++++++++++++++++++++++++++++++++++++++++--
>   1 files changed, 46 insertions(+), 2 deletions(-)
>
> diff --git a/posix-aio-compat.c b/posix-aio-compat.c
> index 8dc00cb..aa30673 100644
> --- a/posix-aio-compat.c
> +++ b/posix-aio-compat.c
> @@ -30,6 +30,7 @@
>
>   #include "block/raw-posix-aio.h"
>
> +static void do_spawn_thread(void);
>
>   struct qemu_paiocb {
>       BlockDriverAIOCB common;
> @@ -64,6 +65,9 @@ static pthread_attr_t attr;
>   static int max_threads = 64;
>   static int cur_threads = 0;
>   static int idle_threads = 0;
> +static int new_threads = 0;     /* backlog of threads we need to create */
> +static int pending_threads = 0; /* threads created but not running yet */
> +static QEMUBH *new_thread_bh;
>   static QTAILQ_HEAD(, qemu_paiocb) request_list;
>
>   #ifdef CONFIG_PREADV
> @@ -311,6 +315,13 @@ static void *aio_thread(void *unused)
>
>       pid = getpid();
>
> +    mutex_lock(&lock);
> +    if (new_threads) {
> +        do_spawn_thread();
> +    }
> +    pending_threads--;
> +    mutex_unlock(&lock);
> +
>       while (1) {
>           struct qemu_paiocb *aiocb;
>           ssize_t ret = 0;
> @@ -381,11 +392,18 @@ static void *aio_thread(void *unused)
>       return NULL;
>   }
>
> -static void spawn_thread(void)
> +static void do_spawn_thread(void)
>   {
>       sigset_t set, oldset;
>
> -    cur_threads++;
> +    if (!new_threads) {
> +        return;
> +    }
> +
> +    new_threads--;
> +    pending_threads++;
> +
> +    mutex_unlock(&lock);
>
>       /* block all signals */
>       if (sigfillset(&set)) die("sigfillset");
> @@ -394,6 +412,31 @@ static void spawn_thread(void)
>       thread_create(&thread_id,&attr, aio_thread, NULL);
>
>       if (sigprocmask(SIG_SETMASK,&oldset, NULL)) die("sigprocmask restore");
> +
> +    mutex_lock(&lock);
> +}
> +
> +static void spawn_thread_bh_fn(void *opaque)
> +{
> +    mutex_lock(&lock);
> +    do_spawn_thread();
> +    mutex_unlock(&lock);
> +}

The locking here is odd.  Why not call do_spawn_thread() without the 
lock, and acquire the lock for the section that needs to hold it?

Otherwise, the logic seems correct to me.

Kevin, could you also take a look at this patch?

Regards,

Anthony Liguori

> +
> +static void spawn_thread(void)
> +{
> +    cur_threads++;
> +    new_threads++;
> +    /* If there are threads being created, they will spawn new workers, so
> +     * we don't spend time creating many threads in a loop holding a mutex or
> +     * starving the current vcpu.
> +     *
> +     * If there are no idle threads, ask the main thread to create one, so we
> +     * inherit the correct affinity instead of the vcpu affinity.
> +     */
> +    if (!pending_threads) {
> +        qemu_bh_schedule(new_thread_bh);
> +    }
>   }
>
>   static void qemu_paio_submit(struct qemu_paiocb *aiocb)
> @@ -665,6 +708,7 @@ int paio_init(void)
>           die2(ret, "pthread_attr_setdetachstate");
>
>       QTAILQ_INIT(&request_list);
> +    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
>
>       posix_aio_state = s;
>       return 0;
Avi Kivity Aug. 14, 2011, 3:43 a.m. UTC | #8
On 08/12/2011 06:24 AM, Anthony Liguori wrote:
> On 08/08/2011 06:37 AM, Avi Kivity wrote:
>>
>> +static void spawn_thread_bh_fn(void *opaque)
>> +{
>> +    mutex_lock(&lock);
>> +    do_spawn_thread();
>> +    mutex_unlock(&lock);
>> +}
>
> The locking here is odd.  Why not call do_spawn_thread() without the 
> lock, and acquire the lock for the section that needs to hold it?

Just the way the code evolved.  Note that aio_thread() does need to take 
the lock.  However, it is indeed cleaner to take the lock when needed 
rather than drop it when not.

>
> Otherwise, the logic seems correct to me.
>
> Kevin, could you also take a look at this patch?

Yes please.
diff mbox

Patch

diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index 8dc00cb..aa30673 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -30,6 +30,7 @@ 
 
 #include "block/raw-posix-aio.h"
 
+static void do_spawn_thread(void);
 
 struct qemu_paiocb {
     BlockDriverAIOCB common;
@@ -64,6 +65,9 @@  static pthread_attr_t attr;
 static int max_threads = 64;
 static int cur_threads = 0;
 static int idle_threads = 0;
+static int new_threads = 0;     /* backlog of threads we need to create */
+static int pending_threads = 0; /* threads created but not running yet */
+static QEMUBH *new_thread_bh;
 static QTAILQ_HEAD(, qemu_paiocb) request_list;
 
 #ifdef CONFIG_PREADV
@@ -311,6 +315,13 @@  static void *aio_thread(void *unused)
 
     pid = getpid();
 
+    mutex_lock(&lock);
+    if (new_threads) {
+        do_spawn_thread();
+    }
+    pending_threads--;
+    mutex_unlock(&lock);
+
     while (1) {
         struct qemu_paiocb *aiocb;
         ssize_t ret = 0;
@@ -381,11 +392,18 @@  static void *aio_thread(void *unused)
     return NULL;
 }
 
-static void spawn_thread(void)
+static void do_spawn_thread(void)
 {
     sigset_t set, oldset;
 
-    cur_threads++;
+    if (!new_threads) {
+        return;
+    }
+
+    new_threads--;
+    pending_threads++;
+
+    mutex_unlock(&lock);
 
     /* block all signals */
     if (sigfillset(&set)) die("sigfillset");
@@ -394,6 +412,31 @@  static void spawn_thread(void)
     thread_create(&thread_id, &attr, aio_thread, NULL);
 
     if (sigprocmask(SIG_SETMASK, &oldset, NULL)) die("sigprocmask restore");
+
+    mutex_lock(&lock);
+}
+
+static void spawn_thread_bh_fn(void *opaque)
+{
+    mutex_lock(&lock);
+    do_spawn_thread();
+    mutex_unlock(&lock);
+}
+
+static void spawn_thread(void)
+{
+    cur_threads++;
+    new_threads++;
+    /* If there are threads being created, they will spawn new workers, so
+     * we don't spend time creating many threads in a loop holding a mutex or
+     * starving the current vcpu.
+     *
+     * If there are no idle threads, ask the main thread to create one, so we
+     * inherit the correct affinity instead of the vcpu affinity.
+     */
+    if (!pending_threads) {
+        qemu_bh_schedule(new_thread_bh);
+    }
 }
 
 static void qemu_paio_submit(struct qemu_paiocb *aiocb)
@@ -665,6 +708,7 @@  int paio_init(void)
         die2(ret, "pthread_attr_setdetachstate");
 
     QTAILQ_INIT(&request_list);
+    new_thread_bh = qemu_bh_new(spawn_thread_bh_fn, NULL);
 
     posix_aio_state = s;
     return 0;