diff mbox series

[3/5] coroutines: abort if we try to enter a still-sleeping coroutine

Message ID 0c039d00e03331d863ee249810d9778313670803.1511145863.git.jcody@redhat.com
State New
Headers show
Series None | expand

Commit Message

Jeff Cody Nov. 20, 2017, 2:46 a.m. UTC
Once a coroutine is "sleeping", the timer callback will either enter the
coroutine, or schedule it for the next AioContext if using iothreads.

It is illegal to enter that coroutine while waiting for this timer
event and subsequent callback.  This patch will catch such an attempt,
and abort QEMU with an error.

Like with the previous patch, we cannot rely solely on the co->caller
check for recursive entry.  The prematurely entered coroutine may exit
with COROUTINE_TERMINATE before the timer expires, making co->caller no
longer valid.

We can clear co->sleeping in in co_sleep_cb(), because any doubly entry
attempt after point should be caught by either the co->scheduled or
co->caller checks.

Signed-off-by: Jeff Cody <jcody@redhat.com>
---
 include/qemu/coroutine_int.h | 2 ++
 util/qemu-coroutine-sleep.c  | 3 +++
 util/qemu-coroutine.c        | 5 +++++
 3 files changed, 10 insertions(+)

Comments

Stefan Hajnoczi Nov. 20, 2017, 11:43 a.m. UTC | #1
On Sun, Nov 19, 2017 at 09:46:44PM -0500, Jeff Cody wrote:
> diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
> index 931cdc9..b071217 100644
> --- a/include/qemu/coroutine_int.h
> +++ b/include/qemu/coroutine_int.h
> @@ -56,6 +56,8 @@ struct Coroutine {
>  
>      int scheduled;
>  
> +    int sleeping;

s/int/bool/

BTW an alternative to adding individual bools is to implement a finite
state machine for the entire coroutine lifecycle.  A single function can
validate all state transitions:

  void check_state_transition(CoState old, CoState new,
                              const char *action)
  {
      const char *errmsg = fsm[old][new];
      if (!errmsg) {
          return; /* valid transition! */
      }

      fprintf(stderr, "Cannot %s coroutine from %s state\n",
              action, state_name[old]);
      abort();
  }

Specifying fsm[][] forces us to think through all possible state
transitions.  This approach is proactive whereas adding bool flags is
reactive since it only covers a subset of states that were encountered
after crashes.  I'm not sure if it's worth it though :).
Jeff Cody Nov. 20, 2017, 1:45 p.m. UTC | #2
On Mon, Nov 20, 2017 at 11:43:34AM +0000, Stefan Hajnoczi wrote:
> On Sun, Nov 19, 2017 at 09:46:44PM -0500, Jeff Cody wrote:
> > diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
> > index 931cdc9..b071217 100644
> > --- a/include/qemu/coroutine_int.h
> > +++ b/include/qemu/coroutine_int.h
> > @@ -56,6 +56,8 @@ struct Coroutine {
> >  
> >      int scheduled;
> >  
> > +    int sleeping;
> 
> s/int/bool/
> 

OK.

> BTW an alternative to adding individual bools is to implement a finite
> state machine for the entire coroutine lifecycle.  A single function can
> validate all state transitions:
> 
>   void check_state_transition(CoState old, CoState new,
>                               const char *action)
>   {
>       const char *errmsg = fsm[old][new];
>       if (!errmsg) {
>           return; /* valid transition! */
>       }
> 
>       fprintf(stderr, "Cannot %s coroutine from %s state\n",
>               action, state_name[old]);
>       abort();
>   }
> 
> Specifying fsm[][] forces us to think through all possible state
> transitions.  This approach is proactive whereas adding bool flags is
> reactive since it only covers a subset of states that were encountered
> after crashes.  I'm not sure if it's worth it though :).

Interesting idea; maybe more for 2.12 instead of 2.11, though?

Jeff
Paolo Bonzini Nov. 20, 2017, 10:30 p.m. UTC | #3
On 20/11/2017 03:46, Jeff Cody wrote:
> Once a coroutine is "sleeping", the timer callback will either enter the
> coroutine, or schedule it for the next AioContext if using iothreads.
> 
> It is illegal to enter that coroutine while waiting for this timer
> event and subsequent callback.  This patch will catch such an attempt,
> and abort QEMU with an error.
> 
> Like with the previous patch, we cannot rely solely on the co->caller
> check for recursive entry.  The prematurely entered coroutine may exit
> with COROUTINE_TERMINATE before the timer expires, making co->caller no
> longer valid.
> 
> We can clear co->sleeping in in co_sleep_cb(), because any doubly entry
> attempt after point should be caught by either the co->scheduled or
> co->caller checks.
> 
> Signed-off-by: Jeff Cody <jcody@redhat.com>
> ---
>  include/qemu/coroutine_int.h | 2 ++
>  util/qemu-coroutine-sleep.c  | 3 +++
>  util/qemu-coroutine.c        | 5 +++++
>  3 files changed, 10 insertions(+)
> 
> diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
> index 931cdc9..b071217 100644
> --- a/include/qemu/coroutine_int.h
> +++ b/include/qemu/coroutine_int.h
> @@ -56,6 +56,8 @@ struct Coroutine {
>  
>      int scheduled;
>  
> +    int sleeping;

Is this a different "state" (in Stefan's parlance) than scheduled?  In
practice both means that someone may call qemu_(aio_)coroutine_enter
concurrently, so you'd better not do it yourself.

Paolo

> +
>      QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
>      QSLIST_ENTRY(Coroutine) co_scheduled_next;
>  };
> diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
> index 9c56550..11ae95a 100644
> --- a/util/qemu-coroutine-sleep.c
> +++ b/util/qemu-coroutine-sleep.c
> @@ -13,6 +13,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/coroutine.h"
> +#include "qemu/coroutine_int.h"
>  #include "qemu/timer.h"
>  #include "block/aio.h"
>  
> @@ -25,6 +26,7 @@ static void co_sleep_cb(void *opaque)
>  {
>      CoSleepCB *sleep_cb = opaque;
>  
> +    sleep_cb->co->sleeping = 0;
>      aio_co_wake(sleep_cb->co);
>  }
>  
> @@ -34,6 +36,7 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
>      CoSleepCB sleep_cb = {
>          .co = qemu_coroutine_self(),
>      };
> +    sleep_cb.co->sleeping = 1;
>      sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
>      timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
>      qemu_coroutine_yield();
> diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
> index 2edab63..1d9f93d 100644
> --- a/util/qemu-coroutine.c
> +++ b/util/qemu-coroutine.c
> @@ -118,6 +118,11 @@ void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
>          abort();
>      }
>  
> +    if (co->sleeping == 1) {
> +        fprintf(stderr, "Cannot enter a co-routine that is still sleeping\n");
> +        abort();
> +    }
> +
>      if (co->caller) {
>          fprintf(stderr, "Co-routine re-entered recursively\n");
>          abort();
>
Jeff Cody Nov. 20, 2017, 10:35 p.m. UTC | #4
On Mon, Nov 20, 2017 at 11:30:39PM +0100, Paolo Bonzini wrote:
> On 20/11/2017 03:46, Jeff Cody wrote:
> > Once a coroutine is "sleeping", the timer callback will either enter the
> > coroutine, or schedule it for the next AioContext if using iothreads.
> > 
> > It is illegal to enter that coroutine while waiting for this timer
> > event and subsequent callback.  This patch will catch such an attempt,
> > and abort QEMU with an error.
> > 
> > Like with the previous patch, we cannot rely solely on the co->caller
> > check for recursive entry.  The prematurely entered coroutine may exit
> > with COROUTINE_TERMINATE before the timer expires, making co->caller no
> > longer valid.
> > 
> > We can clear co->sleeping in in co_sleep_cb(), because any doubly entry
> > attempt after point should be caught by either the co->scheduled or
> > co->caller checks.
> > 
> > Signed-off-by: Jeff Cody <jcody@redhat.com>
> > ---
> >  include/qemu/coroutine_int.h | 2 ++
> >  util/qemu-coroutine-sleep.c  | 3 +++
> >  util/qemu-coroutine.c        | 5 +++++
> >  3 files changed, 10 insertions(+)
> > 
> > diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
> > index 931cdc9..b071217 100644
> > --- a/include/qemu/coroutine_int.h
> > +++ b/include/qemu/coroutine_int.h
> > @@ -56,6 +56,8 @@ struct Coroutine {
> >  
> >      int scheduled;
> >  
> > +    int sleeping;
> 
> Is this a different "state" (in Stefan's parlance) than scheduled?  In
> practice both means that someone may call qemu_(aio_)coroutine_enter
> concurrently, so you'd better not do it yourself.
> 

It is slightly different; it is from sleeping with a timer via
co_aio_sleep_ns and waking via co_sleep_cb.  Whereas the 'co->scheduled' is
specifically from being scheduled for a specific AioContext, via
aio_co_schedule().

In practice, 'co->schedule' and 'co->sleeping' certainly rhyme, at the very
least.

But having them separate will put the abort closer to where the problem lies,
so it should make debugging a bit easier if we hit it.

> 
> > +
> >      QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
> >      QSLIST_ENTRY(Coroutine) co_scheduled_next;
> >  };
> > diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
> > index 9c56550..11ae95a 100644
> > --- a/util/qemu-coroutine-sleep.c
> > +++ b/util/qemu-coroutine-sleep.c
> > @@ -13,6 +13,7 @@
> >  
> >  #include "qemu/osdep.h"
> >  #include "qemu/coroutine.h"
> > +#include "qemu/coroutine_int.h"
> >  #include "qemu/timer.h"
> >  #include "block/aio.h"
> >  
> > @@ -25,6 +26,7 @@ static void co_sleep_cb(void *opaque)
> >  {
> >      CoSleepCB *sleep_cb = opaque;
> >  
> > +    sleep_cb->co->sleeping = 0;
> >      aio_co_wake(sleep_cb->co);
> >  }
> >  
> > @@ -34,6 +36,7 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
> >      CoSleepCB sleep_cb = {
> >          .co = qemu_coroutine_self(),
> >      };
> > +    sleep_cb.co->sleeping = 1;
> >      sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
> >      timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
> >      qemu_coroutine_yield();
> > diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
> > index 2edab63..1d9f93d 100644
> > --- a/util/qemu-coroutine.c
> > +++ b/util/qemu-coroutine.c
> > @@ -118,6 +118,11 @@ void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
> >          abort();
> >      }
> >  
> > +    if (co->sleeping == 1) {
> > +        fprintf(stderr, "Cannot enter a co-routine that is still sleeping\n");
> > +        abort();
> > +    }
> > +
> >      if (co->caller) {
> >          fprintf(stderr, "Co-routine re-entered recursively\n");
> >          abort();
> > 
>
Paolo Bonzini Nov. 20, 2017, 10:47 p.m. UTC | #5
On 20/11/2017 23:35, Jeff Cody wrote:
>> Is this a different "state" (in Stefan's parlance) than scheduled?  In
>> practice both means that someone may call qemu_(aio_)coroutine_enter
>> concurrently, so you'd better not do it yourself.
>>
> It is slightly different; it is from sleeping with a timer via
> co_aio_sleep_ns and waking via co_sleep_cb.  Whereas the 'co->scheduled' is
> specifically from being scheduled for a specific AioContext, via
> aio_co_schedule().

Right; however, that would only make a difference if we allowed
canceling a co_aio_sleep_ns.  Since we don't want that, they have the
same transitions.

> In practice, 'co->schedule' and 'co->sleeping' certainly rhyme, at the very
> least.
> 
> But having them separate will put the abort closer to where the problem lies,
> so it should make debugging a bit easier if we hit it.

What do you mean by closer?  It would print a slightly more informative
message, but the message is in qemu_aio_coroutine_for both cases.

In fact, unifying co->scheduled and co->sleeping means that you can
easily abort when co_aio_sleep_ns is called on a scheduled coroutine, like

    /* This is valid. */
    aio_co_schedule(qemu_get_current_aio_context(),
                    qemu_coroutine_self());

    /* But only if there was a qemu_coroutine_yield here.  */
    co_aio_sleep_ns(qemu_get_current_aio_context(), 1000);

Thanks,

Paolo
Jeff Cody Nov. 20, 2017, 11:08 p.m. UTC | #6
On Mon, Nov 20, 2017 at 11:47:09PM +0100, Paolo Bonzini wrote:
> On 20/11/2017 23:35, Jeff Cody wrote:
> >> Is this a different "state" (in Stefan's parlance) than scheduled?  In
> >> practice both means that someone may call qemu_(aio_)coroutine_enter
> >> concurrently, so you'd better not do it yourself.
> >>
> > It is slightly different; it is from sleeping with a timer via
> > co_aio_sleep_ns and waking via co_sleep_cb.  Whereas the 'co->scheduled' is
> > specifically from being scheduled for a specific AioContext, via
> > aio_co_schedule().
> 
> Right; however, that would only make a difference if we allowed
> canceling a co_aio_sleep_ns.  Since we don't want that, they have the
> same transitions.
> 
> > In practice, 'co->schedule' and 'co->sleeping' certainly rhyme, at the very
> > least.
> > 
> > But having them separate will put the abort closer to where the problem lies,
> > so it should make debugging a bit easier if we hit it.
> 
> What do you mean by closer?  It would print a slightly more informative
> message, but the message is in qemu_aio_coroutine_for both cases.
> 

Sorry, sloppy wording; I meant what you said above, that the error message
is more informative, so by tracking down where co->sleeping is set the
developer is closer to where the problem lies.

> In fact, unifying co->scheduled and co->sleeping means that you can
> easily abort when co_aio_sleep_ns is called on a scheduled coroutine, like
> 
>     /* This is valid. */
>     aio_co_schedule(qemu_get_current_aio_context(),
>                     qemu_coroutine_self());
> 
>     /* But only if there was a qemu_coroutine_yield here.  */
>     co_aio_sleep_ns(qemu_get_current_aio_context(), 1000);
>

That is true.  But we could also check (co->sleeping || co->scheduled) in
co_aio_sleep_ns() though, as well.

Hmm... not checking co->sleeping in co_aio_sleep_ns() is a bug in my
patch.  We don't want to schedule a coroutine on two different timers,
either.

So what do you think about adding this to the patch:

@@ -34,6 +36,7 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
     CoSleepCB sleep_cb = {
         .co = qemu_coroutine_self(),
     };
+    if (sleep_cb.co->sleeping == 1 || sleep_cb.co->scheduled == 1) {
+       fprintf(stderr, "Cannot sleep a co-routine that is already sleeping "
+                       " or scheduled\n");
+       abort();
+    }
+    sleep_cb.co->sleeping = 1;
     sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
     timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
     qemu_coroutine_yield();


Jeff
Paolo Bonzini Nov. 20, 2017, 11:13 p.m. UTC | #7
On 21/11/2017 00:08, Jeff Cody wrote:
> @@ -34,6 +36,7 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
>      CoSleepCB sleep_cb = {
>          .co = qemu_coroutine_self(),
>      };
> +    if (sleep_cb.co->sleeping == 1 || sleep_cb.co->scheduled == 1) {
> +       fprintf(stderr, "Cannot sleep a co-routine that is already sleeping "
> +                       " or scheduled\n");
> +       abort();
> +    }
> +    sleep_cb.co->sleeping = 1;
>      sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
>      timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
>      qemu_coroutine_yield();

I understand that this was just an example and not the actual patch, but
I'll still point out that this loses the benefit (better error message)
of keeping the flags separate.

What do you think about making "scheduled" a const char * and assigning
__func__ to it (i.e. either "aio_co_schedule" or "co_aio_sleep_ns")?

Thanks,

Paolo
Jeff Cody Nov. 20, 2017, 11:31 p.m. UTC | #8
On Tue, Nov 21, 2017 at 12:13:46AM +0100, Paolo Bonzini wrote:
> On 21/11/2017 00:08, Jeff Cody wrote:
> > @@ -34,6 +36,7 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
> >      CoSleepCB sleep_cb = {
> >          .co = qemu_coroutine_self(),
> >      };
> > +    if (sleep_cb.co->sleeping == 1 || sleep_cb.co->scheduled == 1) {
> > +       fprintf(stderr, "Cannot sleep a co-routine that is already sleeping "
> > +                       " or scheduled\n");
> > +       abort();
> > +    }
> > +    sleep_cb.co->sleeping = 1;
> >      sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
> >      timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
> >      qemu_coroutine_yield();
> 
> I understand that this was just an example and not the actual patch, but
> I'll still point out that this loses the benefit (better error message)
> of keeping the flags separate.
> 
> What do you think about making "scheduled" a const char * and assigning
> __func__ to it (i.e. either "aio_co_schedule" or "co_aio_sleep_ns")?
> 

Ohhh, nice.  I'll spin a v2 with that, and merge patches 3 and 5 together.
And then maybe for 2.12 we can look at making it a fsm, like Stefan
suggested (or somehow make coroutine entry thread safe and idempotent).

Jeff
Stefan Hajnoczi Nov. 21, 2017, 10:17 a.m. UTC | #9
On Mon, Nov 20, 2017 at 08:45:21AM -0500, Jeff Cody wrote:
> On Mon, Nov 20, 2017 at 11:43:34AM +0000, Stefan Hajnoczi wrote:
> > On Sun, Nov 19, 2017 at 09:46:44PM -0500, Jeff Cody wrote:
> > BTW an alternative to adding individual bools is to implement a finite
> > state machine for the entire coroutine lifecycle.  A single function can
> > validate all state transitions:
> > 
> >   void check_state_transition(CoState old, CoState new,
> >                               const char *action)
> >   {
> >       const char *errmsg = fsm[old][new];
> >       if (!errmsg) {
> >           return; /* valid transition! */
> >       }
> > 
> >       fprintf(stderr, "Cannot %s coroutine from %s state\n",
> >               action, state_name[old]);
> >       abort();
> >   }
> > 
> > Specifying fsm[][] forces us to think through all possible state
> > transitions.  This approach is proactive whereas adding bool flags is
> > reactive since it only covers a subset of states that were encountered
> > after crashes.  I'm not sure if it's worth it though :).
> 
> Interesting idea; maybe more for 2.12 instead of 2.11, though?

Sure.

Stefan
diff mbox series

Patch

diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
index 931cdc9..b071217 100644
--- a/include/qemu/coroutine_int.h
+++ b/include/qemu/coroutine_int.h
@@ -56,6 +56,8 @@  struct Coroutine {
 
     int scheduled;
 
+    int sleeping;
+
     QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
     QSLIST_ENTRY(Coroutine) co_scheduled_next;
 };
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
index 9c56550..11ae95a 100644
--- a/util/qemu-coroutine-sleep.c
+++ b/util/qemu-coroutine-sleep.c
@@ -13,6 +13,7 @@ 
 
 #include "qemu/osdep.h"
 #include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
 #include "qemu/timer.h"
 #include "block/aio.h"
 
@@ -25,6 +26,7 @@  static void co_sleep_cb(void *opaque)
 {
     CoSleepCB *sleep_cb = opaque;
 
+    sleep_cb->co->sleeping = 0;
     aio_co_wake(sleep_cb->co);
 }
 
@@ -34,6 +36,7 @@  void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
     CoSleepCB sleep_cb = {
         .co = qemu_coroutine_self(),
     };
+    sleep_cb.co->sleeping = 1;
     sleep_cb.ts = aio_timer_new(ctx, type, SCALE_NS, co_sleep_cb, &sleep_cb);
     timer_mod(sleep_cb.ts, qemu_clock_get_ns(type) + ns);
     qemu_coroutine_yield();
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index 2edab63..1d9f93d 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -118,6 +118,11 @@  void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
         abort();
     }
 
+    if (co->sleeping == 1) {
+        fprintf(stderr, "Cannot enter a co-routine that is still sleeping\n");
+        abort();
+    }
+
     if (co->caller) {
         fprintf(stderr, "Co-routine re-entered recursively\n");
         abort();