diff mbox

[1/7] coroutine-ucontext: use __thread

Message ID 1417183941-26329-2-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Nov. 28, 2014, 2:12 p.m. UTC
ELF thread local storage is about 10% faster on tests/test-coroutine's
perf/cost test.  The timing on my machine is 160ns per iteration with
pthread TLS, 145 with ELF TLS.

Based on a patch by Kevin Wolf and Peter Lieven, but redone to follow
the model of coroutine-win32.c (including the important "noinline"
attribute!!!).

Platforms without thread-local storage (OpenBSD probably?) will need
a new-enough GCC for this to compile, in order to use the same emutls
support that Windows already relies on.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 coroutine-ucontext.c | 64 +++++++++++++---------------------------------------
 1 file changed, 16 insertions(+), 48 deletions(-)

Comments

Peter Maydell Nov. 28, 2014, 2:28 p.m. UTC | #1
On 28 November 2014 at 14:12, Paolo Bonzini <pbonzini@redhat.com> wrote:
> +/* This function is marked noinline to prevent GCC from inlining it
> + * into coroutine_trampoline(). If we allow it to do that then it
> + * hoists the code to get the address of the TLS variable "current"
> + * out of the while() loop. This is an invalid transformation because
> + * the SwitchToFiber() call may be called when running thread A but
> + * return in thread B, and so we might be in a different thread
> + * context each time round the loop.
> + */
>  CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
>                                        CoroutineAction action)

??? You've added the comment but the function is not marked
"noinline" at all...

-- PMM
Markus Armbruster Nov. 28, 2014, 2:45 p.m. UTC | #2
Paolo Bonzini <pbonzini@redhat.com> writes:

> ELF thread local storage is about 10% faster on tests/test-coroutine's
> perf/cost test.  The timing on my machine is 160ns per iteration with
> pthread TLS, 145 with ELF TLS.
>
> Based on a patch by Kevin Wolf and Peter Lieven, but redone to follow
> the model of coroutine-win32.c (including the important "noinline"
> attribute!!!).
>
> Platforms without thread-local storage (OpenBSD probably?) will need
> a new-enough GCC for this to compile, in order to use the same emutls
> support that Windows already relies on.
[...]
> @@ -193,15 +155,22 @@ void qemu_coroutine_delete(Coroutine *co_)
>      g_free(co);
>  }
>  
> +/* This function is marked noinline to prevent GCC from inlining it
> + * into coroutine_trampoline(). If we allow it to do that then it
> + * hoists the code to get the address of the TLS variable "current"
> + * out of the while() loop. This is an invalid transformation because
> + * the SwitchToFiber() call may be called when running thread A but
> + * return in thread B, and so we might be in a different thread
> + * context each time round the loop.
> + */
>  CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
>                                        CoroutineAction action)

Err, did you forget the actual __attribute__((noinline))?

>  {
>      CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
>      CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
> -    CoroutineThreadState *s = coroutine_get_thread_state();
>      int ret;
>  
> -    s->current = to_;
> +    current = to_;
>  
>      ret = sigsetjmp(from->env, 0);
>      if (ret == 0) {
[...]
Kevin Wolf Nov. 28, 2014, 3:36 p.m. UTC | #3
Am 28.11.2014 um 15:45 hat Markus Armbruster geschrieben:
> Paolo Bonzini <pbonzini@redhat.com> writes:
> 
> > ELF thread local storage is about 10% faster on tests/test-coroutine's
> > perf/cost test.  The timing on my machine is 160ns per iteration with
> > pthread TLS, 145 with ELF TLS.
> >
> > Based on a patch by Kevin Wolf and Peter Lieven, but redone to follow
> > the model of coroutine-win32.c (including the important "noinline"
> > attribute!!!).
> >
> > Platforms without thread-local storage (OpenBSD probably?) will need
> > a new-enough GCC for this to compile, in order to use the same emutls
> > support that Windows already relies on.
> [...]
> > @@ -193,15 +155,22 @@ void qemu_coroutine_delete(Coroutine *co_)
> >      g_free(co);
> >  }
> >  
> > +/* This function is marked noinline to prevent GCC from inlining it
> > + * into coroutine_trampoline(). If we allow it to do that then it
> > + * hoists the code to get the address of the TLS variable "current"
> > + * out of the while() loop. This is an invalid transformation because
> > + * the SwitchToFiber() call may be called when running thread A but
> > + * return in thread B, and so we might be in a different thread
> > + * context each time round the loop.
> > + */
> >  CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
> >                                        CoroutineAction action)
> 
> Err, did you forget the actual __attribute__((noinline))?

The comment needs updating, too. There's no SwitchToFiber() in the
ucontext implementation.

Kevin
diff mbox

Patch

diff --git a/coroutine-ucontext.c b/coroutine-ucontext.c
index 4bf2cde..d86e3e1 100644
--- a/coroutine-ucontext.c
+++ b/coroutine-ucontext.c
@@ -25,7 +25,6 @@ 
 #include <stdlib.h>
 #include <setjmp.h>
 #include <stdint.h>
-#include <pthread.h>
 #include <ucontext.h>
 #include "qemu-common.h"
 #include "block/coroutine_int.h"
@@ -48,15 +47,8 @@  typedef struct {
 /**
  * Per-thread coroutine bookkeeping
  */
-typedef struct {
-    /** Currently executing coroutine */
-    Coroutine *current;
-
-    /** The default coroutine */
-    CoroutineUContext leader;
-} CoroutineThreadState;
-
-static pthread_key_t thread_state_key;
+static __thread CoroutineUContext leader;
+static __thread Coroutine *current;
 
 /*
  * va_args to makecontext() must be type 'int', so passing
@@ -68,36 +60,6 @@  union cc_arg {
     int i[2];
 };
 
-static CoroutineThreadState *coroutine_get_thread_state(void)
-{
-    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
-
-    if (!s) {
-        s = g_malloc0(sizeof(*s));
-        s->current = &s->leader.base;
-        pthread_setspecific(thread_state_key, s);
-    }
-    return s;
-}
-
-static void qemu_coroutine_thread_cleanup(void *opaque)
-{
-    CoroutineThreadState *s = opaque;
-
-    g_free(s);
-}
-
-static void __attribute__((constructor)) coroutine_init(void)
-{
-    int ret;
-
-    ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
-    if (ret != 0) {
-        fprintf(stderr, "unable to create leader key: %s\n", strerror(errno));
-        abort();
-    }
-}
-
 static void coroutine_trampoline(int i0, int i1)
 {
     union cc_arg arg;
@@ -193,15 +155,22 @@  void qemu_coroutine_delete(Coroutine *co_)
     g_free(co);
 }
 
+/* This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * the SwitchToFiber() call may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
 CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
                                       CoroutineAction action)
 {
     CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
     CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
-    CoroutineThreadState *s = coroutine_get_thread_state();
     int ret;
 
-    s->current = to_;
+    current = to_;
 
     ret = sigsetjmp(from->env, 0);
     if (ret == 0) {
@@ -212,14 +181,13 @@  CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
 
 Coroutine *qemu_coroutine_self(void)
 {
-    CoroutineThreadState *s = coroutine_get_thread_state();
-
-    return s->current;
+    if (!current) {
+        current = &leader.base;
+    }
+    return current;
 }
 
 bool qemu_in_coroutine(void)
 {
-    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
-
-    return s && s->current->caller;
+    return current && current->caller;
 }