[RFC,2/3] raw-posix: Convert Linux AIO submission to coroutines

Message ID 54784412.2080700@redhat.com
State New

Commit Message

Paolo Bonzini Nov. 28, 2014, 9:44 a.m. UTC
On 28/11/2014 03:59, Ming Lei wrote:
> Hi Kevin,
> 
> On Wed, Nov 26, 2014 at 10:46 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> This improves the performance of requests because an ACB doesn't need to
>> be allocated on the heap any more. It also makes the code nicer and
>> smaller.
> 
> I am not sure it is good way for linux aio optimization:
> 
> - for raw image with some constraint, coroutine can be avoided since
> io_submit() won't sleep most of times
> 
> - handling one coroutine takes much more time than handling malloc,
> memset and free on a small buffer; see the following test data:
> 
>          --   241ns per coroutine
>          --   61ns per (malloc, memset, free for 128bytes)
> 
> I still think we should figure out a fast path to avoid coroutines
> for linux-aio with raw images, otherwise it can't scale well for high
> IOPS devices.

sigsetjmp/siglongjmp are just ~60 instructions, so they cannot account for
180ns (600 clock cycles).  The cost of creating and destroying the
coroutine must come from somewhere else.
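
For context, the kind of lifecycle micro-benchmark behind such a per-coroutine
number looks roughly like this.  This is only a minimal sketch in the spirit
of the perf tests in tests/test-coroutine, not the actual test code, and the
function names are made up:

    #include <glib.h>
    #include "qemu-common.h"
    #include "block/coroutine.h"

    static void coroutine_fn empty_entry(void *opaque)
    {
        /* Return immediately; when the entry function terminates, the
         * coroutine is recycled into the pool (or destroyed).
         */
    }

    static void measure_lifecycle(unsigned long iterations)
    {
        unsigned long i;
        gint64 start, elapsed_us;

        start = g_get_monotonic_time();
        for (i = 0; i < iterations; i++) {
            Coroutine *co = qemu_coroutine_create(empty_entry);
            qemu_coroutine_enter(co, NULL);
        }
        elapsed_us = g_get_monotonic_time() - start;
        g_print("%.1f ns per create+enter\n",
                elapsed_us * 1000.0 / iterations);
    }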

Let's just try something else.  Let's remove the pool mutex, as suggested
by Peter but in a way that works even with non-ioeventfd backends.

I still believe we will end up with some kind of coroutine bypass scheme
(even the coroutine path _does_ allocate an AIOCB, so calling bdrv_aio_readv
directly can help), but it certainly cannot hurt to optimize hot code.

The patch below has a single pool where coroutines are placed on
destruction, and a per-thread allocation pool.  Whenever the destruction
pool is big enough, the next thread that runs out of coroutines will
steal from it instead of making a new coroutine.  If this works, it
would be beautiful in two ways:

1) the allocation does not have to do any atomic operation in the fast
path; it uses thread-local storage only.  Once every POOL_BATCH_SIZE
allocations it does a single atomic_xchg.  Release does an atomic_cmpxchg
loop (which hopefully doesn't cause any starvation) and an atomic_inc.
A simplified sketch of the whole scheme is shown after point 2 below.

2) in theory this should be completely adaptive.  The number of coroutines
around should be a little more than POOL_BATCH_SIZE * number of allocating
threads; so this also removes qemu_coroutine_adjust_pool_size.  (The previous
pool size was POOL_BATCH_SIZE * number of block backends, so it was a bit
more generous).
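
Stripped of the GPrivate bookkeeping, the scheme boils down to something
like this (a simplified sketch with slightly different variable names, not
the patch itself; the real code and the QSLIST_*_ATOMIC macros are below):

    /* alloc_pool is thread-local, release_pool is shared by all threads. */
    static __thread QSLIST_HEAD(, Coroutine) alloc_pool;
    static QSLIST_HEAD(, Coroutine) release_pool;
    static unsigned int release_pool_size;

    static Coroutine *pool_get(void)
    {
        Coroutine *co = QSLIST_FIRST(&alloc_pool);   /* fast path: no atomics */
        if (!co && release_pool_size > POOL_BATCH_SIZE) {
            /* Slow path: steal the whole release pool with one atomic_xchg. */
            release_pool_size = 0;
            QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
            co = QSLIST_FIRST(&alloc_pool);
        }
        if (co) {
            QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
        }
        return co;    /* NULL means the caller creates a brand new coroutine */
    }

    static bool pool_put(Coroutine *co)
    {
        if (release_pool_size < POOL_BATCH_SIZE * 2) {
            /* cmpxchg loop to push onto the shared list, then one atomic_inc. */
            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
            atomic_inc(&release_pool_size);
            return true;
        }
        return false; /* enough coroutines around, let the caller destroy it */
    }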

The patch below is very raw and untested beyond tests/test-coroutine.
There may be some premature optimization here (not using GPrivate for the
pool itself, even though I still need a GPrivate to run the per-thread
destructor), but it was easy enough.  Ming, Kevin, can you benchmark it?

Related to this, we can check whether __thread beats GPrivate in
coroutine-ucontext.c.  Every clock cycle counts (600 clock cycles are a 2%
improvement at 3 GHz and 100 kiops), so we can see what we get.
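
The difference is essentially the one below (variable names made up, not the
actual coroutine-ucontext.c code): __thread compiles to a direct TLS access,
while GPrivate goes through a function call, but only GPrivate gives us a
destructor when the thread exits:

    /* GPrivate: each access is a call into GLib (pthread_getspecific
     * underneath on POSIX), but a destructor can be registered.
     */
    static GPrivate current_key = G_PRIVATE_INIT(NULL);

    static Coroutine *get_current(void)
    {
        return g_private_get(&current_key);
    }

    /* __thread: typically one or two instructions per access, but no
     * per-thread destructor hook.
     */
    static __thread Coroutine *current;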

Paolo

Patch

diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index d433b90..6a01e2f 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -191,6 +191,17 @@  struct {                                                                \
 #define QSLIST_INSERT_HEAD(head, elm, field) do {                        \
         (elm)->field.sle_next = (head)->slh_first;                      \
         (head)->slh_first = (elm);                                      \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_INSERT_HEAD_ATOMIC(head, elm, field) do {                   \
+	do {                                                               \
+	   (elm)->field.sle_next = (head)->slh_first;                      \
+	} while (atomic_cmpxchg(&(head)->slh_first, (elm)->field.sle_next, \
+                              (elm)) != (elm)->field.sle_next);            \
+} while (/*CONSTCOND*/0)
+
+#define QSLIST_MOVE_ATOMIC(dest, src) do {                               \
+	(dest)->slh_first = atomic_xchg(&(src)->slh_first, NULL);        \
 } while (/*CONSTCOND*/0)
 
 #define QSLIST_REMOVE_HEAD(head, field) do {                             \
diff --git a/qemu-coroutine.c b/qemu-coroutine.c
index bd574aa..60d761f 100644
--- a/qemu-coroutine.c
+++ b/qemu-coroutine.c
@@ -15,35 +15,52 @@ 
 #include "trace.h"
 #include "qemu-common.h"
 #include "qemu/thread.h"
+#include "qemu/atomic.h"
 #include "block/coroutine.h"
 #include "block/coroutine_int.h"
 
 enum {
-    POOL_DEFAULT_SIZE = 64,
+    POOL_BATCH_SIZE = 64,
 };
 
 /** Free list to speed up creation */
-static QemuMutex pool_lock;
-static QSLIST_HEAD(, Coroutine) pool = QSLIST_HEAD_INITIALIZER(pool);
+static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(release_pool);
 static unsigned int pool_size;
-static unsigned int pool_max_size = POOL_DEFAULT_SIZE;
+static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(alloc_pool);
+
+/* The GPrivate is only used to invoke coroutine_pool_cleanup.  */
+static void coroutine_pool_cleanup(void *value);
+static GPrivate dummy_key = G_PRIVATE_INIT(coroutine_pool_cleanup);
 
 Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
 {
     Coroutine *co = NULL;
 
     if (CONFIG_COROUTINE_POOL) {
-        qemu_mutex_lock(&pool_lock);
-        co = QSLIST_FIRST(&pool);
+        co = QSLIST_FIRST(&alloc_pool);
+        if (!co) {
+            if (pool_size > POOL_BATCH_SIZE) {
+                /* This is not exact; there could be a little skew between pool_size
+                 * and the actual size of release_pool.  But it is just a heuristic;
+                 * it does not need to be perfect.
+                 */
+                pool_size = 0;
+                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
+                co = QSLIST_FIRST(&alloc_pool);
+
+                /* Use this slow path as an easy place to make the per-thread value
+                 * non-NULL and thus trigger the destructor.
+                 */
+                g_private_set(&dummy_key, &dummy_key);
+            }
+        }
         if (co) {
-            QSLIST_REMOVE_HEAD(&pool, pool_next);
-            pool_size--;
+            QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
         }
-        qemu_mutex_unlock(&pool_lock);
     }
 
     if (!co) {
         co = qemu_coroutine_new();
     }
 
     co->entry = entry;
@@ -54,36 +71,26 @@  Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
 static void coroutine_delete(Coroutine *co)
 {
     if (CONFIG_COROUTINE_POOL) {
-        qemu_mutex_lock(&pool_lock);
-        if (pool_size < pool_max_size) {
-            QSLIST_INSERT_HEAD(&pool, co, pool_next);
+        if (pool_size < POOL_BATCH_SIZE * 2) {
             co->caller = NULL;
-            pool_size++;
-            qemu_mutex_unlock(&pool_lock);
+            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
+            atomic_inc(&pool_size);
             return;
         }
-        qemu_mutex_unlock(&pool_lock);
     }
 
     qemu_coroutine_delete(co);
 }
 
-static void __attribute__((constructor)) coroutine_pool_init(void)
-{
-    qemu_mutex_init(&pool_lock);
-}
-
-static void __attribute__((destructor)) coroutine_pool_cleanup(void)
+static void coroutine_pool_cleanup(void *value)
 {
     Coroutine *co;
     Coroutine *tmp;
 
-    QSLIST_FOREACH_SAFE(co, &pool, pool_next, tmp) {
-        QSLIST_REMOVE_HEAD(&pool, pool_next);
+    QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
+        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
         qemu_coroutine_delete(co);
     }
-
-    qemu_mutex_destroy(&pool_lock);
 }
 
 static void coroutine_swap(Coroutine *from, Coroutine *to)
@@ -140,20 +156,4 @@  void coroutine_fn qemu_coroutine_yield(void)
 
 void qemu_coroutine_adjust_pool_size(int n)
 {
-    qemu_mutex_lock(&pool_lock);
-
-    pool_max_size += n;
-
-    /* Callers should never take away more than they added */
-    assert(pool_max_size >= POOL_DEFAULT_SIZE);
-
-    /* Trim oversized pool down to new max */
-    while (pool_size > pool_max_size) {
-        Coroutine *co = QSLIST_FIRST(&pool);
-        QSLIST_REMOVE_HEAD(&pool, pool_next);
-        pool_size--;
-        qemu_coroutine_delete(co);
-    }
-
-    qemu_mutex_unlock(&pool_lock);
 }