diff mbox

[v2,5/7] coroutine: rewrite pool to avoid mutex

Message ID 1417518350-6167-6-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Dec. 2, 2014, 11:05 a.m. UTC
This patch removes the mutex by using fancy lock-free manipulation of
the pool.  Lock-free stacks and queues are not hard, but they can suffer
from the ABA problem so they are better avoided unless you have some
deferred reclamation scheme like RCU.  Otherwise you have to stick
with adding to a list, and emptying it completely.  This is what this
patch does, by coupling a lock-free global list of available coroutines
with per-CPU lists that are actually used on coroutine creation.

Whenever the destruction pool is big enough, the next thread that runs
out of coroutines will steal the whole destruction pool.  This is positive
in two ways:

1) the allocation does not have to do any atomic operation in the fast
path, it's entirely using thread-local storage.  Once every POOL_BATCH_SIZE
allocations it will do a single atomic_xchg.  Release does an atomic_cmpxchg
loop, that hopefully doesn't cause any starvation, and an atomic_inc.

A later patch will also remove atomic operations from the release path,
and try to avoid the atomic_xchg altogether---succeeding in doing so if
all devices either use ioeventfd or are not submitting requests actively.

2) in theory this should be completely adaptive.  The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also empties qemu_coroutine_adjust_pool_size.  (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous.  But if you actually have many high-iodepth disks, it's better
to put them in different iothreads, which will also use separate thread
pools and aio=native file descriptors).

This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
No matter if we end with some kind of coroutine bypass scheme or not,
it cannot hurt to optimize hot code.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
v1->v2: leave personal opinions out of commit messages :) [Kevin]

 qemu-coroutine.c | 93 +++++++++++++++++++++++++-------------------------------
 1 file changed, 42 insertions(+), 51 deletions(-)

Comments

Peter Lieven Dec. 2, 2014, 12:09 p.m. UTC | #1
On 02.12.2014 12:05, Paolo Bonzini wrote:
> This patch removes the mutex by using fancy lock-free manipulation of
> the pool.  Lock-free stacks and queues are not hard, but they can suffer
> from the ABA problem so they are better avoided unless you have some
> deferred reclamation scheme like RCU.  Otherwise you have to stick
> with adding to a list, and emptying it completely.  This is what this
> patch does, by coupling a lock-free global list of available coroutines
> with per-CPU lists that are actually used on coroutine creation.
>
> Whenever the destruction pool is big enough, the next thread that runs
> out of coroutines will steal the whole destruction pool.  This is positive
> in two ways:
>
> 1) the allocation does not have to do any atomic operation in the fast
> path, it's entirely using thread-local storage.  Once every POOL_BATCH_SIZE
> allocations it will do a single atomic_xchg.  Release does an atomic_cmpxchg
> loop, that hopefully doesn't cause any starvation, and an atomic_inc.
>
> A later patch will also remove atomic operations from the release path,
> and try to avoid the atomic_xchg altogether---succeeding in doing so if
> all devices either use ioeventfd or are not submitting requests actively.
>
> 2) in theory this should be completely adaptive.  The number of coroutines
> around should be a little more than POOL_BATCH_SIZE * number of allocating
> threads; so this also empties qemu_coroutine_adjust_pool_size.  (The previous
> pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
> more generous.  But if you actually have many high-iodepth disks, it's better
> to put them in different iothreads, which will also use separate thread
> pools and aio=native file descriptors).
>
> This speeds up perf/cost (in tests/test-coroutine) by a factor of ~1.33.
> No matter if we end with some kind of coroutine bypass scheme or not,
> it cannot hurt to optimize hot code.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> v1->v2: leave personal opinions out of commit messages :) [Kevin]
>
>   qemu-coroutine.c | 93 +++++++++++++++++++++++++-------------------------------
>   1 file changed, 42 insertions(+), 51 deletions(-)
>
> diff --git a/qemu-coroutine.c b/qemu-coroutine.c
> index bd574aa..aee1017 100644
> --- a/qemu-coroutine.c
> +++ b/qemu-coroutine.c
> @@ -15,31 +15,57 @@
>   #include "trace.h"
>   #include "qemu-common.h"
>   #include "qemu/thread.h"
> +#include "qemu/atomic.h"
>   #include "block/coroutine.h"
>   #include "block/coroutine_int.h"
>   
>   enum {
> -    POOL_DEFAULT_SIZE = 64,
> +    POOL_BATCH_SIZE = 64,
>   };
>   
>   /** Free list to speed up creation */
> -static QemuMutex pool_lock;
> -static QSLIST_HEAD(, Coroutine) pool = QSLIST_HEAD_INITIALIZER(pool);
> -static unsigned int pool_size;
> -static unsigned int pool_max_size = POOL_DEFAULT_SIZE;
> +static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
> +static unsigned int release_pool_size;
> +static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
> +static __thread Notifier coroutine_pool_cleanup_notifier;
> +
> +static void coroutine_pool_cleanup(Notifier *n, void *value)
> +{
> +    Coroutine *co;
> +    Coroutine *tmp;
> +
> +    QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
> +        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
> +        qemu_coroutine_delete(co);
> +    }
> +}
>   
>   Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>   {
>       Coroutine *co = NULL;
>   
>       if (CONFIG_COROUTINE_POOL) {
> -        qemu_mutex_lock(&pool_lock);
> -        co = QSLIST_FIRST(&pool);
> +        co = QSLIST_FIRST(&alloc_pool);
> +        if (!co) {
> +            if (release_pool_size > POOL_BATCH_SIZE) {
> +                /* Slow path; a good place to register the destructor, too.  */
> +                if (!coroutine_pool_cleanup_notifier.notify) {
> +                    coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
> +                    qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
> +                }
> +
> +                /* This is not exact; there could be a little skew between
> +                 * release_pool_size and the actual size of release_pool.  But
> +                 * it is just a heuristic, it does not need to be perfect.
> +                 */
> +                release_pool_size = 0;
> +                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
> +                co = QSLIST_FIRST(&alloc_pool);
> +            }
> +        }
>           if (co) {
> -            QSLIST_REMOVE_HEAD(&pool, pool_next);
> -            pool_size--;
> +            QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
>           }
> -        qemu_mutex_unlock(&pool_lock);
>       }
>   
>       if (!co) {
> @@ -53,39 +80,19 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
>   
>   static void coroutine_delete(Coroutine *co)
>   {
> +    co->caller = NULL;
> +
>       if (CONFIG_COROUTINE_POOL) {
> -        qemu_mutex_lock(&pool_lock);
> -        if (pool_size < pool_max_size) {
> -            QSLIST_INSERT_HEAD(&pool, co, pool_next);
> -            co->caller = NULL;
> -            pool_size++;
> -            qemu_mutex_unlock(&pool_lock);
> +        if (release_pool_size < POOL_BATCH_SIZE * 2) {
> +            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
> +            atomic_inc(&release_pool_size);
>               return;
>           }
> -        qemu_mutex_unlock(&pool_lock);
>       }
>   
>       qemu_coroutine_delete(co);
>   }
>   
> -static void __attribute__((constructor)) coroutine_pool_init(void)
> -{
> -    qemu_mutex_init(&pool_lock);
> -}
> -
> -static void __attribute__((destructor)) coroutine_pool_cleanup(void)
> -{
> -    Coroutine *co;
> -    Coroutine *tmp;
> -
> -    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
> -        QSLIST_REMOVE_HEAD(&pool, pool_next);
> -        qemu_coroutine_delete(co);
> -    }
> -
> -    qemu_mutex_destroy(&pool_lock);
> -}
> -

I still feel we should leave this destructor in to clean up the release_pool.

Peter
Paolo Bonzini Dec. 2, 2014, 12:13 p.m. UTC | #2
On 02/12/2014 13:09, Peter Lieven wrote:
>>
>> -static void __attribute__((destructor)) coroutine_pool_cleanup(void)
>> -{
>> -    Coroutine *co;
>> -    Coroutine *tmp;
>> -
>> -    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
>> -        QSLIST_REMOVE_HEAD(&pool, pool_next);
>> -        qemu_coroutine_delete(co);
>> -    }
>> -
>> -    qemu_mutex_destroy(&pool_lock);
>> -}
>> -
> 
> I still feel we should leave this destructor in to clean up the
> release_pool.

Why?  If you run QEMU under valgrind, there are thousands of blocks that
we do not free.  Stefan/Kevin, what do you think?

Paolo
Peter Lieven Dec. 2, 2014, 12:18 p.m. UTC | #3
On 02.12.2014 13:13, Paolo Bonzini wrote:
>
> On 02/12/2014 13:09, Peter Lieven wrote:
>>> -static void __attribute__((destructor)) coroutine_pool_cleanup(void)
>>> -{
>>> -    Coroutine *co;
>>> -    Coroutine *tmp;
>>> -
>>> -    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
>>> -        QSLIST_REMOVE_HEAD(&pool, pool_next);
>>> -        qemu_coroutine_delete(co);
>>> -    }
>>> -
>>> -    qemu_mutex_destroy(&pool_lock);
>>> -}
>>> -
>> I still feel we should leave this destructor in to clean up the
>> release_pool.
> Why?  If you run QEMU under valgrind, there are thousands of blocks that
> we do not free.  Stefan/Kevin, what do you think?

Before this patch we cleaned up this part at least.
I have learned that it bad style not to clean up your resources.
Just because other code parts do not do it we should not introduce
new parts that don't it.

Peter
Paolo Bonzini Dec. 2, 2014, 12:32 p.m. UTC | #4
On 02/12/2014 13:18, Peter Lieven wrote:
> On 02.12.2014 13:13, Paolo Bonzini wrote:
>>
>> On 02/12/2014 13:09, Peter Lieven wrote:
>>>> -static void __attribute__((destructor)) coroutine_pool_cleanup(void)
>>>> -{
>>>> -    Coroutine *co;
>>>> -    Coroutine *tmp;
>>>> -
>>>> -    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
>>>> -        QSLIST_REMOVE_HEAD(&pool, pool_next);
>>>> -        qemu_coroutine_delete(co);
>>>> -    }
>>>> -
>>>> -    qemu_mutex_destroy(&pool_lock);
>>>> -}
>>>> -
>>> I still feel we should leave this destructor in to clean up the
>>> release_pool.
>> Why?  If you run QEMU under valgrind, there are thousands of blocks that
>> we do not free.  Stefan/Kevin, what do you think?
> 
> Before this patch we cleaned up this part at least.
> I have learned that it bad style not to clean up your resources.
> Just because other code parts do not do it we should not introduce
> new parts that don't it.

Which other parts do we cleanup?  For example file descriptors are not
cleaned up, much less most memory; the kernel is there to do it for us.
 I think it's up to the maintainers to decide.

Paolo
Kevin Wolf Dec. 2, 2014, 1:04 p.m. UTC | #5
Am 02.12.2014 um 13:13 hat Paolo Bonzini geschrieben:
> 
> 
> On 02/12/2014 13:09, Peter Lieven wrote:
> >>
> >> -static void __attribute__((destructor)) coroutine_pool_cleanup(void)
> >> -{
> >> -    Coroutine *co;
> >> -    Coroutine *tmp;
> >> -
> >> -    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
> >> -        QSLIST_REMOVE_HEAD(&pool, pool_next);
> >> -        qemu_coroutine_delete(co);
> >> -    }
> >> -
> >> -    qemu_mutex_destroy(&pool_lock);
> >> -}
> >> -
> > 
> > I still feel we should leave this destructor in to clean up the
> > release_pool.
> 
> Why?  If you run QEMU under valgrind, there are thousands of blocks that
> we do not free.  Stefan/Kevin, what do you think?

The destructor doesn't seem to be doing anything but freeing memory,
which the OS can indeed do for us. I don't mind either way.

Kevin
diff mbox

Patch

diff --git a/qemu-coroutine.c b/qemu-coroutine.c
index bd574aa..aee1017 100644
--- a/qemu-coroutine.c
+++ b/qemu-coroutine.c
@@ -15,31 +15,57 @@ 
 #include "trace.h"
 #include "qemu-common.h"
 #include "qemu/thread.h"
+#include "qemu/atomic.h"
 #include "block/coroutine.h"
 #include "block/coroutine_int.h"
 
 enum {
-    POOL_DEFAULT_SIZE = 64,
+    POOL_BATCH_SIZE = 64,
 };
 
 /** Free list to speed up creation */
-static QemuMutex pool_lock;
-static QSLIST_HEAD(, Coroutine) pool = QSLIST_HEAD_INITIALIZER(pool);
-static unsigned int pool_size;
-static unsigned int pool_max_size = POOL_DEFAULT_SIZE;
+static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
+static unsigned int release_pool_size;
+static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
+static __thread Notifier coroutine_pool_cleanup_notifier;
+
+static void coroutine_pool_cleanup(Notifier *n, void *value)
+{
+    Coroutine *co;
+    Coroutine *tmp;
+
+    QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
+        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+        qemu_coroutine_delete(co);
+    }
+}
 
 Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
 {
     Coroutine *co = NULL;
 
     if (CONFIG_COROUTINE_POOL) {
-        qemu_mutex_lock(&pool_lock);
-        co = QSLIST_FIRST(&pool);
+        co = QSLIST_FIRST(&alloc_pool);
+        if (!co) {
+            if (release_pool_size > POOL_BATCH_SIZE) {
+                /* Slow path; a good place to register the destructor, too.  */
+                if (!coroutine_pool_cleanup_notifier.notify) {
+                    coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
+                    qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
+                }
+
+                /* This is not exact; there could be a little skew between
+                 * release_pool_size and the actual size of release_pool.  But
+                 * it is just a heuristic, it does not need to be perfect.
+                 */
+                release_pool_size = 0;
+                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
+                co = QSLIST_FIRST(&alloc_pool);
+            }
+        }
         if (co) {
-            QSLIST_REMOVE_HEAD(&pool, pool_next);
-            pool_size--;
+            QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
         }
-        qemu_mutex_unlock(&pool_lock);
     }
 
     if (!co) {
@@ -53,39 +80,19 @@  Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
 
 static void coroutine_delete(Coroutine *co)
 {
+    co->caller = NULL;
+
     if (CONFIG_COROUTINE_POOL) {
-        qemu_mutex_lock(&pool_lock);
-        if (pool_size < pool_max_size) {
-            QSLIST_INSERT_HEAD(&pool, co, pool_next);
-            co->caller = NULL;
-            pool_size++;
-            qemu_mutex_unlock(&pool_lock);
+        if (release_pool_size < POOL_BATCH_SIZE * 2) {
+            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
+            atomic_inc(&release_pool_size);
             return;
         }
-        qemu_mutex_unlock(&pool_lock);
     }
 
     qemu_coroutine_delete(co);
 }
 
-static void __attribute__((constructor)) coroutine_pool_init(void)
-{
-    qemu_mutex_init(&pool_lock);
-}
-
-static void __attribute__((destructor)) coroutine_pool_cleanup(void)
-{
-    Coroutine *co;
-    Coroutine *tmp;
-
-    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
-        QSLIST_REMOVE_HEAD(&pool, pool_next);
-        qemu_coroutine_delete(co);
-    }
-
-    qemu_mutex_destroy(&pool_lock);
-}
-
 static void coroutine_swap(Coroutine *from, Coroutine *to)
 {
     CoroutineAction ret;
@@ -140,20 +147,4 @@  void coroutine_fn qemu_coroutine_yield(void)
 
 void qemu_coroutine_adjust_pool_size(int n)
 {
-    qemu_mutex_lock(&pool_lock);
-
-    pool_max_size += n;
-
-    /* Callers should never take away more than they added */
-    assert(pool_max_size >= POOL_DEFAULT_SIZE);
-
-    /* Trim oversized pool down to new max */
-    while (pool_size > pool_max_size) {
-        Coroutine *co = QSLIST_FIRST(&pool);
-        QSLIST_REMOVE_HEAD(&pool, pool_next);
-        pool_size--;
-        qemu_coroutine_delete(co);
-    }
-
-    qemu_mutex_unlock(&pool_lock);
 }