Patchwork [11/19] use a bottom half to run timers

login
register
mail settings
Submitter Paolo Bonzini
Date Dec. 21, 2009, 8:09 a.m.
Message ID <1261382970-23251-12-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/41528/
State New
Headers show

Comments

Paolo Bonzini - Dec. 21, 2009, 8:09 a.m.
Make the timer subsystem register its own bottom half instead of
placing the bottom half code in the heart of the main loop.  To
test if an alarm timer is pending, just check if the bottom half is
scheduled.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 vl.c |   68 ++++++++++++++++++++++++++++++++++++-----------------------------
 1 files changed, 38 insertions(+), 30 deletions(-)
Marcelo Tosatti - Dec. 23, 2009, 6:37 p.m.
On Mon, Dec 21, 2009 at 09:09:22AM +0100, Paolo Bonzini wrote:
> Make the timer subsystem register its own bottom half instead of
> placing the bottom half code in the heart of the main loop.  To
> test if an alarm timer is pending, just check if the bottom half is
> scheduled.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  vl.c |   68 ++++++++++++++++++++++++++++++++++++-----------------------------
>  1 files changed, 38 insertions(+), 30 deletions(-)
> 
> diff --git a/vl.c b/vl.c
> index 78807f5..289aadc 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -573,10 +573,17 @@ struct qemu_alarm_timer {
>      void (*rearm)(struct qemu_alarm_timer *t);
>      void *priv;
>  
> +    QEMUBH *bh;
>      char expired;
> -    char pending;
>  };
>  
> +static struct qemu_alarm_timer *alarm_timer;
> +
> +static inline int qemu_alarm_pending(void)
> +{
> +    return qemu_bh_scheduled(alarm_timer->bh);
> +}
> +
>  static inline int alarm_has_dynticks(struct qemu_alarm_timer *t)
>  {
>      return !!t->rearm;
> @@ -593,8 +600,6 @@ static void qemu_rearm_alarm_timer(struct qemu_alarm_timer *t)
>  /* TODO: MIN_TIMER_REARM_US should be optimized */
>  #define MIN_TIMER_REARM_US 250
>  
> -static struct qemu_alarm_timer *alarm_timer;
> -
>  #ifdef _WIN32
>  
>  struct qemu_alarm_win32 {
> @@ -874,7 +879,7 @@ void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
>  
>      /* Rearm if necessary  */
>      if (pt == &active_timers[ts->clock->type]) {
> -        if (!alarm_timer->pending) {
> +        if (!qemu_alarm_pending()) {
>              qemu_rearm_alarm_timer(alarm_timer);
>          }
>          /* Interrupt execution to force deadline recalculation.  */
> @@ -1001,6 +1006,31 @@ static const VMStateDescription vmstate_timers = {
>  
>  static void qemu_event_increment(void);
>  
> +static void qemu_timer_bh(void *opaque)
> +{
> +    struct qemu_alarm_timer *t = opaque;
> +
> +    /* rearm timer, if not periodic */
> +    if (t->expired) {
> +        t->expired = 0;
> +        qemu_rearm_alarm_timer(t);
> +    }
> +
> +    /* vm time timers */
> +    if (vm_running) {
> +        if (!cur_cpu || likely(!(cur_cpu->singlestep_enabled & SSTEP_NOTIMER)))
> +            qemu_run_timers(&active_timers[QEMU_CLOCK_VIRTUAL],
> +                            qemu_get_clock(vm_clock));
> +    }
> +
> +    /* real time timers */
> +    qemu_run_timers(&active_timers[QEMU_CLOCK_REALTIME],
> +                    qemu_get_clock(rt_clock));
> +
> +    qemu_run_timers(&active_timers[QEMU_CLOCK_HOST],
> +                    qemu_get_clock(host_clock));
> +}
> +
>  #ifdef _WIN32
>  static void CALLBACK host_alarm_handler(UINT uTimerID, UINT uMsg,
>                                          DWORD_PTR dwUser, DWORD_PTR dw1,
> @@ -1059,8 +1089,7 @@ static void host_alarm_handler(int host_signum)
>              cpu_exit(next_cpu);
>          }
>  #endif
> -        t->pending = 1;
> -        qemu_notify_event();
> +        qemu_bh_schedule(t->bh);
>      }
>  }
>  
> @@ -1446,7 +1475,8 @@ static int init_timer_alarm(void)
>      }
>  
>      /* first event is at time 0 */
> -    t->pending = 1;
> +    t->bh = qemu_bh_new(qemu_timer_bh, t);
> +    qemu_bh_schedule(t->bh);
>      alarm_timer = t;

You should probably make sure the bh handling is signal safe. Perhaps
use atomic test-and-set for bh->schedule on qemu_bh_poll, etc...

Also there's a similar problem with the expired flag

> +    /* rearm timer, if not periodic */
> +    if (t->expired) {
> +        t->expired = 0;
> +        qemu_rearm_alarm_timer(t);
> +    }

(it was there before your patch).

BTW, if host_alarm_handler runs after is t->expired is cleared,
but before qemu_run_timers->callback->qemu_mod_timer, subsequent
qemu_mod_timer callbacks fail to rearm the alarm timer, in case
timer in question expires earlier?

Nice patchset!

>      qemu_add_vm_change_state_handler(alarm_timer_on_change_state_rearm, t);
>  
> @@ -3811,28 +3841,6 @@ void main_loop_wait(int timeout)
>  
>      slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
>  
> -    /* rearm timer, if not periodic */
> -    if (alarm_timer->expired) {
> -        alarm_timer->expired = 0;
> -        qemu_rearm_alarm_timer(alarm_timer);
> -    }
> -
> -    alarm_timer->pending = 0;
> -
> -    /* vm time timers */
> -    if (vm_running) {
> -        if (!cur_cpu || likely(!(cur_cpu->singlestep_enabled & SSTEP_NOTIMER)))
> -            qemu_run_timers(&active_timers[QEMU_CLOCK_VIRTUAL],
> -                            qemu_get_clock(vm_clock));
> -    }
> -
> -    /* real time timers */
> -    qemu_run_timers(&active_timers[QEMU_CLOCK_REALTIME],
> -                    qemu_get_clock(rt_clock));
> -
> -    qemu_run_timers(&active_timers[QEMU_CLOCK_HOST],
> -                    qemu_get_clock(host_clock));
> -
>      /* Check bottom-halves last in case any of the earlier events triggered
>         them.  */
>      qemu_bh_poll();
> @@ -3888,7 +3896,7 @@ static void tcg_cpu_exec(void)
>  
>          if (!vm_running)
>              break;
> -        if (alarm_timer->pending)
> +        if (qemu_alarm_pending())
>              break;
>          if (cpu_can_run(env))
>              ret = qemu_cpu_exec(env);
> -- 
> 1.6.5.2
> 
> 
>
Paolo Bonzini - Dec. 24, 2009, 10:27 a.m.
On 12/23/2009 07:37 PM, Marcelo Tosatti wrote:
> You should probably make sure the bh handling is signal safe. Perhaps
> use atomic test-and-set for bh->schedule on qemu_bh_poll, etc...

The worst thing that can happen is that qemu_bh_poll misses the alarm 
bottom half, and tcg_cpu_exec exits immediately because 
qemu_alarm_pending() returns true.  This is the same that would happen 
without my patches, if the signal was raised during qemu_bh_poll which 
is after the timers were processed.

> Also there's a similar problem with the expired flag
>
>> >  +    /* rearm timer, if not periodic */
>> >  +    if (t->expired) {
>> >  +        t->expired = 0;
>> >  +        qemu_rearm_alarm_timer(t);
>> >  +    }
> (it was there before your patch).

If t->expired is true, the alarm timer is dynticks and host_alarm_timer 
is one-shot, so it is impossible that host_alarm_timer is called before 
qemu_rearm_alarm_timer finishes.  (Except for the Windows bug fixed 
earlier in the series).

> BTW, if host_alarm_handler runs after is t->expired is cleared,
> but before qemu_run_timers->callback->qemu_mod_timer, subsequent
> qemu_mod_timer callbacks fail to rearm the alarm timer, in case
> timer in question expires earlier?

bh->scheduled is set to 0 before executing the bottom half, so 
qemu_mod_timer sees qemu_alarm_pending() == false and does rearm the 
alarm timer.

Cases like this are why it is important to get t->expired vs. 
qemu_alarm_pending right; I had thought of some similar races while 
reviewing the submission, but I admit I hadn't thought about any of 
these particular issues, so thanks for the review and for making me do 
my homework.

> Nice patchset!

Thanks. :-)

Paolo
Marcelo Tosatti - Dec. 24, 2009, 11:25 a.m.
On Thu, Dec 24, 2009 at 11:27:06AM +0100, Paolo Bonzini wrote:
> On 12/23/2009 07:37 PM, Marcelo Tosatti wrote:
>> You should probably make sure the bh handling is signal safe. Perhaps
>> use atomic test-and-set for bh->schedule on qemu_bh_poll, etc...
>
> The worst thing that can happen is that qemu_bh_poll misses the alarm 
> bottom half, and tcg_cpu_exec exits immediately because  
> qemu_alarm_pending() returns true.  This is the same that would happen  
> without my patches, if the signal was raised during qemu_bh_poll which  
> is after the timers were processed.

OK. 

>> Also there's a similar problem with the expired flag
>>
>>> >  +    /* rearm timer, if not periodic */
>>> >  +    if (t->expired) {
>>> >  +        t->expired = 0;
>>> >  +        qemu_rearm_alarm_timer(t);
>>> >  +    }
>> (it was there before your patch).
>
> If t->expired is true, the alarm timer is dynticks and host_alarm_timer  
> is one-shot, so it is impossible that host_alarm_timer is called before  
> qemu_rearm_alarm_timer finishes.  (Except for the Windows bug fixed  
> earlier in the series).
>
>> BTW, if host_alarm_handler runs after is t->expired is cleared,
>> but before qemu_run_timers->callback->qemu_mod_timer, subsequent
>> qemu_mod_timer callbacks fail to rearm the alarm timer, in case
>> timer in question expires earlier?
>
> bh->scheduled is set to 0 before executing the bottom half, so  
> qemu_mod_timer sees qemu_alarm_pending() == false and does rearm the  
> alarm timer.

static void qemu_timer_bh(void *opaque)
{
    struct qemu_alarm_timer *t = opaque;

    /* rearm timer, if not periodic */
    if (t->expired) {
        t->expired = 0;
        qemu_rearm_alarm_timer(t);
    }

    -> host_alarm_handler fires, sets bh->scheduled = 1
    -> qemu_mod_timer sees qemu_alarm_pending() == true and does
       not rearm the alarm timer.

    /* vm time timers */
    if (vm_running) {
        if (!cur_cpu || likely(!(cur_cpu->singlestep_enabled & STEP_NOTIMER)))
            qemu_run_timers(&active_timers[QEMU_CLOCK_VIRTUAL],
                            qemu_get_clock(vm_clock));
    }

    /* real time timers */
    qemu_run_timers(&active_timers[QEMU_CLOCK_REALTIME],
                    qemu_get_clock(rt_clock));

    qemu_run_timers(&active_timers[QEMU_CLOCK_HOST],
                    qemu_get_clock(host_clock));
}


>
> Cases like this are why it is important to get t->expired vs.  
> qemu_alarm_pending right; I had thought of some similar races while  
> reviewing the submission, but I admit I hadn't thought about any of  
> these particular issues, so thanks for the review and for making me do  
> my homework.

The purpose of t->expired is somewhat unclear to me (and similarly 
in the current code). Why not simply leave the rearm decision
to qemu_mod_timer? (and kill the explicit qemu_rearm_alarm_timer from 
the timer bh handler). Maybe for a future patch...
Paolo Bonzini - Dec. 24, 2009, 12:21 p.m.
On 12/24/2009 12:25 PM, Marcelo Tosatti wrote:
>      /* rearm timer, if not periodic */
>      if (t->expired) {
>          t->expired = 0;
>          qemu_rearm_alarm_timer(t);
>      }
>
>      ->  host_alarm_handler fires, sets bh->scheduled = 1
>      ->  qemu_mod_timer sees qemu_alarm_pending() == true and does
>         not rearm the alarm timer.

The bottom half will be reexecuted right away and the alarm will be 
rearmed then.  So if it happens once it is not a problem.  If it happens 
continuously, you have a sort of interrupt storm so this is the least of 
your problems. :-)

Paolo
Jamie Lokier - Jan. 4, 2010, 7:38 p.m.
Anthony Liguori wrote:
> introduces a subtle semantic change.  Previously, timers always ran 
> before bottom halves whereas after this change, timers may run after 
> some bottoms halves but before others.  While this should be okay in 
> principle, in practice, I'm sure it'll introduce regressions.  I'd be 
> very surprised if cris wasn't affected by this.

In principle, if it does affect something, it seems likely there is
already a buggy race condition.  After all, if the timer and bottom
half could trigger at the same time, which is the condition where the
order is significant, then in principle the timer could have triggered
slightly later because it depends on the host alarm behaviour.

> But more importantly, I think timer dispatch needs to be part of the 
> select loop.  malc has a git tree that replaces host alarm timers with 
> select() timeouts.  This has a lot of really nice properties like it 
> eliminates the need for signals and EINTR handling.  A move like this 
> would likely make this more difficult.

I agree that select loop is better, but don't you still need a host
alarm signal for when the guest is running for a long time without
trapping?

-- Jamie
Michael S. Tsirkin - Jan. 4, 2010, 8:01 p.m.
On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
>> Make the timer subsystem register its own bottom half instead of
>> placing the bottom half code in the heart of the main loop.  To
>> test if an alarm timer is pending, just check if the bottom half is
>> scheduled.
>>
>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
>
> I'm not a huge fan of this for a couple reasons.  The first is that it  
> introduces a subtle semantic change.  Previously, timers always ran  
> before bottom halves whereas after this change, timers may run after  
> some bottoms halves but before others.  While this should be okay in  
> principle, in practice, I'm sure it'll introduce regressions.  I'd be  
> very surprised if cris wasn't affected by this.
>
> But more importantly, I think timer dispatch needs to be part of the  
> select loop.  malc has a git tree that replaces host alarm timers with  
> select() timeouts.

Where is that tree?

IMO we need that, I am not sure all code is as signal-safe
as it should be. At least crashes that I saw with winxp install
seem to be related to signal handling.

>  This has a lot of really nice properties like it  
> eliminates the need for signals and EINTR handling.  A move like this  
> would likely make this more difficult.
>
> I think the opposite sort of move makes more sense.  Treating bottom  
> halves as 0-duration timer events.  Unfortunately, things like cris do  
> not handle this sort of change very well.
>
> Regards,
>
> Anthony Liguori
>
Paolo Bonzini - Jan. 4, 2010, 8:01 p.m.
On 01/04/2010 09:24 PM, Anthony Liguori wrote:
>
> I'm not a huge fan of this for a couple reasons.  The first is that
> it introduces a subtle semantic change.  Previously, timers always
> ran before bottom halves whereas after this change, timers may run
> after some bottoms halves but before others.

I see what you mean, and you are right: qemu_bh_new adds a
bottom half at the beginning of the queue, so it's pretty much
guaranteed that a ptimer's bottom half will run _before_ the alarm timer's.

There are three possible fixes:

1) make async.c use a tail queue.  Fixes the bug, but it is too clever IMHO.

2) in tcg_exec, where there is

         if (timer_alarm_pending) {
             timer_alarm_pending = 0;
             break;
         }

instead check if any bottom half is scheduled.  With this change, after 
the timers run, if the ptimer's bottom half hadn't run TCG would not 
execute code, qemu_bh_calculate_timeout would make main_loop_wait 
nonblocking, and the ptimer's bottom half would execute right away.

BTW after my series the above check will test whether the timer bottom 
half is scheduled, so in some sense this could be considered a bugfix 
that would be placed _very early_ in the series or could even go in 
independently.

3) Both of the above.  2 would provide the fix and 1 would provide a 
performance improvement by avoiding the useless looping.

> But more importantly, I think timer dispatch needs to be part of the
> select loop.  malc has a git tree that replaces host alarm timers
> with select() timeouts.  This has a lot of really nice properties
> like it eliminates the need for signals and EINTR handling.  A move
> like this would likely make this more difficult.

Not necessarily, or at least, splitting qemu-timer.c may make it 
marginally more difficult but not having a bottom half for timers.

With qemu-timer.c split you'd need something like

    if (rc == 0) host_alarm_handler ();

after the select loop.  I suppose you could basically revert this patch 
and move timer handling into host_alarm_handler, but the code should 
work independent of this patch.  This patch (modulo your other 
objection) just adds a level of indirection but doesn't change the 
overall structure of the main loop.

Paolo
Anthony Liguori - Jan. 4, 2010, 8:24 p.m.
On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
> Make the timer subsystem register its own bottom half instead of
> placing the bottom half code in the heart of the main loop.  To
> test if an alarm timer is pending, just check if the bottom half is
> scheduled.
>
> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>

I'm not a huge fan of this for a couple reasons.  The first is that it 
introduces a subtle semantic change.  Previously, timers always ran 
before bottom halves whereas after this change, timers may run after 
some bottoms halves but before others.  While this should be okay in 
principle, in practice, I'm sure it'll introduce regressions.  I'd be 
very surprised if cris wasn't affected by this.

But more importantly, I think timer dispatch needs to be part of the 
select loop.  malc has a git tree that replaces host alarm timers with 
select() timeouts.  This has a lot of really nice properties like it 
eliminates the need for signals and EINTR handling.  A move like this 
would likely make this more difficult.

I think the opposite sort of move makes more sense.  Treating bottom 
halves as 0-duration timer events.  Unfortunately, things like cris do 
not handle this sort of change very well.

Regards,

Anthony Liguori
Anthony Liguori - Jan. 4, 2010, 11:54 p.m.
On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
>    
>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
>>      
>>> Make the timer subsystem register its own bottom half instead of
>>> placing the bottom half code in the heart of the main loop.  To
>>> test if an alarm timer is pending, just check if the bottom half is
>>> scheduled.
>>>
>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
>>>        
>> I'm not a huge fan of this for a couple reasons.  The first is that it
>> introduces a subtle semantic change.  Previously, timers always ran
>> before bottom halves whereas after this change, timers may run after
>> some bottoms halves but before others.  While this should be okay in
>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
>> very surprised if cris wasn't affected by this.
>>
>> But more importantly, I think timer dispatch needs to be part of the
>> select loop.  malc has a git tree that replaces host alarm timers with
>> select() timeouts.
>>      
> Where is that tree?
>    

http://repo.or.cz/w/qemu/malc.git  mtloop

> IMO we need that, I am not sure all code is as signal-safe
> as it should be. At least crashes that I saw with winxp install
> seem to be related to signal handling.
>    

Regards,

Anthony Liguori
Anthony Liguori - Jan. 4, 2010, 11:59 p.m.
On 01/04/2010 02:01 PM, Paolo Bonzini wrote:
> On 01/04/2010 09:24 PM, Anthony Liguori wrote:
>>
>> I'm not a huge fan of this for a couple reasons.  The first is that
>> it introduces a subtle semantic change.  Previously, timers always
>> ran before bottom halves whereas after this change, timers may run
>> after some bottoms halves but before others.
>
> I see what you mean, and you are right: qemu_bh_new adds a
> bottom half at the beginning of the queue, so it's pretty much
> guaranteed that a ptimer's bottom half will run _before_ the alarm 
> timer's.
>
> There are three possible fixes:
>
> 1) make async.c use a tail queue.  Fixes the bug, but it is too clever 
> IMHO.
>
> 2) in tcg_exec, where there is
>
>         if (timer_alarm_pending) {
>             timer_alarm_pending = 0;
>             break;
>         }
>
> instead check if any bottom half is scheduled.  With this change, 
> after the timers run, if the ptimer's bottom half hadn't run TCG would 
> not execute code, qemu_bh_calculate_timeout would make main_loop_wait 
> nonblocking, and the ptimer's bottom half would execute right away.
>
> BTW after my series the above check will test whether the timer bottom 
> half is scheduled, so in some sense this could be considered a bugfix 
> that would be placed _very early_ in the series or could even go in 
> independently.
>
> 3) Both of the above.  2 would provide the fix and 1 would provide a 
> performance improvement by avoiding the useless looping.
>
>> But more importantly, I think timer dispatch needs to be part of the
>> select loop.  malc has a git tree that replaces host alarm timers
>> with select() timeouts.  This has a lot of really nice properties
>> like it eliminates the need for signals and EINTR handling.  A move
>> like this would likely make this more difficult.
>
> Not necessarily, or at least, splitting qemu-timer.c may make it 
> marginally more difficult but not having a bottom half for timers.

When I think of a main loop, I think of something that provides the 
following three services

1) fd based events
2) time based events
3) an idle callback

The problem we have is that bottom halves aren't quite idle callbacks.  
They have some really weird properties about the guarantees of when 
they're executed.  At any rate, you really cannot express a time based 
event as an idle callback because you need to setup the select() timeout 
based on the next deadline and then dispatch timer event based on 
selects return.  idle functions have none of this ability.

> With qemu-timer.c split you'd need something like
>
>    if (rc == 0) host_alarm_handler ();
>
> after the select loop.  I suppose you could basically revert this 
> patch and move timer handling into host_alarm_handler, but the code 
> should work independent of this patch.  This patch (modulo your other 
> objection) just adds a level of indirection but doesn't change the 
> overall structure of the main loop.

Yeah, I'm not at all opposed to the qemu-timer.c split.  What I would 
suggest is just to continue calling timers explicitly instead of trying 
to make them bottoms halves.  I've played with this in the past and the 
regressions are really nasty to track down.  If we're going to make such 
a change, I'd rather spend that regression-busting effort on mtloop or 
completely changing the bottom half semantics to be idle callbacks.

Regards,

Anthony Liguori

> Paolo
Paolo Bonzini - Jan. 5, 2010, 8:38 a.m.
On 01/04/2010 08:38 PM, Jamie Lokier wrote:
> In principle, if it does affect something, it seems likely there is
> already a buggy race condition.  After all, if the timer and bottom
> half could trigger at the same time, which is the condition where the
> order is significant, then in principle the timer could have triggered
> slightly later because it depends on the host alarm behaviour.

No, the problem is when the timer function is _itself_ scheduling a 
bottom half.  Before my patch there was a guarantee that the bh would 
run before TCG, now there is not.  It can be fixed easily though.

Paolo
Michael S. Tsirkin - Jan. 5, 2010, 12:07 p.m.
On Mon, Jan 04, 2010 at 05:54:13PM -0600, Anthony Liguori wrote:
> On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
>> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
>>    
>>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
>>>      
>>>> Make the timer subsystem register its own bottom half instead of
>>>> placing the bottom half code in the heart of the main loop.  To
>>>> test if an alarm timer is pending, just check if the bottom half is
>>>> scheduled.
>>>>
>>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
>>>>        
>>> I'm not a huge fan of this for a couple reasons.  The first is that it
>>> introduces a subtle semantic change.  Previously, timers always ran
>>> before bottom halves whereas after this change, timers may run after
>>> some bottoms halves but before others.  While this should be okay in
>>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
>>> very surprised if cris wasn't affected by this.
>>>
>>> But more importantly, I think timer dispatch needs to be part of the
>>> select loop.  malc has a git tree that replaces host alarm timers with
>>> select() timeouts.
>>>      
>> Where is that tree?
>>    
>
> http://repo.or.cz/w/qemu/malc.git  mtloop

Don't seem to see anything there.
malc?

>> IMO we need that, I am not sure all code is as signal-safe
>> as it should be. At least crashes that I saw with winxp install
>> seem to be related to signal handling.
>>    
>
> Regards,
>
> Anthony Liguori
Paolo Bonzini - Jan. 5, 2010, 12:48 p.m.
On 01/05/2010 12:59 AM, Anthony Liguori wrote:
>
> When I think of a main loop, I think of something that provides the
> following three services
>
> 1) fd based events
> 2) time based events
> 3) an idle callback
>
> The problem we have is that bottom halves aren't quite idle callbacks.
> They have some really weird properties about the guarantees of when
> they're executed.  At any rate, you really cannot express a time based
> event as an idle callback because you need to setup the select() timeout
> based on the next deadline and then dispatch timer event based on
> selects return.  idle functions have none of this ability.

I see bottom halves as a fourth service, providing the ability to 
synchronize stuff that needs to execute in a particular thread 
(guaranteeing thread-safety and especially signal-safety).

In some sense, the problem is that bottom halves conflate this service 
_and_ idle callbacks, and that is the weird property.  Idle callbacks 
are used basically only for DMA, and real-world DMA does not have at all 
the semantics that qemu_bh_schedule_idle provides (which in turn is very 
tricky and not exactly what comments in qemu_bh_update_timeout promise).

Compared to qemu_bh_schedule_idle, bottom halves have sane semantics. 
It would be nicer IMO to remove qemu_bh_schedule_idle, have first-class 
idle callbacks, and leave bottom halves as they are now.  I'll put that 
on the todo list. :-)

That said, I'll try to replace this patch with one that doesn't use a 
bottom half.

Paolo
Anthony Liguori - Jan. 5, 2010, 1:06 p.m.
On 01/05/2010 06:48 AM, Paolo Bonzini wrote:
> On 01/05/2010 12:59 AM, Anthony Liguori wrote:
>>
>> When I think of a main loop, I think of something that provides the
>> following three services
>>
>> 1) fd based events
>> 2) time based events
>> 3) an idle callback
>>
>> The problem we have is that bottom halves aren't quite idle callbacks.
>> They have some really weird properties about the guarantees of when
>> they're executed.  At any rate, you really cannot express a time based
>> event as an idle callback because you need to setup the select() timeout
>> based on the next deadline and then dispatch timer event based on
>> selects return.  idle functions have none of this ability.
>
> I see bottom halves as a fourth service, providing the ability to 
> synchronize stuff that needs to execute in a particular thread 
> (guaranteeing thread-safety and especially signal-safety).

Thread and signal safety are slightly different.  Scheduling an idle 
callback from a signal handler is a pretty reasonable thing to do.

Executing no a different thread is a whole different matter.  Right now, 
we really use bottom halves to run something on the io thread and have a 
different mechanism that ran on a specific thread.  Right now, we mix 
that between bottom halves and on_vcpu.

> In some sense, the problem is that bottom halves conflate this service 
> _and_ idle callbacks, and that is the weird property.  Idle callbacks 
> are used basically only for DMA, and real-world DMA does not have at 
> all the semantics that qemu_bh_schedule_idle provides (which in turn 
> is very tricky and not exactly what comments in qemu_bh_update_timeout 
> promise).

I think before we can do any major surgery, we need to redo the way that 
legacy DMA is handled so that it doesn't require constant polling as it 
does now.

Regards,

Anthony Liguori
malc - Jan. 5, 2010, 3:23 p.m.
On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:

> On Mon, Jan 04, 2010 at 05:54:13PM -0600, Anthony Liguori wrote:
> > On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
> >> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
> >>    
> >>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
> >>>      
> >>>> Make the timer subsystem register its own bottom half instead of
> >>>> placing the bottom half code in the heart of the main loop.  To
> >>>> test if an alarm timer is pending, just check if the bottom half is
> >>>> scheduled.
> >>>>
> >>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> >>>>        
> >>> I'm not a huge fan of this for a couple reasons.  The first is that it
> >>> introduces a subtle semantic change.  Previously, timers always ran
> >>> before bottom halves whereas after this change, timers may run after
> >>> some bottoms halves but before others.  While this should be okay in
> >>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
> >>> very surprised if cris wasn't affected by this.
> >>>
> >>> But more importantly, I think timer dispatch needs to be part of the
> >>> select loop.  malc has a git tree that replaces host alarm timers with
> >>> select() timeouts.
> >>>      
> >> Where is that tree?
> >>    
> >
> > http://repo.or.cz/w/qemu/malc.git  mtloop
> 
> Don't seem to see anything there.
> malc?

Yes?
 
> >> IMO we need that, I am not sure all code is as signal-safe
> >> as it should be. At least crashes that I saw with winxp install
> >> seem to be related to signal handling.
> >>    
> >
> > Regards,
> >
> > Anthony Liguori
>
Michael S. Tsirkin - Jan. 5, 2010, 3:23 p.m.
On Tue, Jan 05, 2010 at 06:23:34PM +0300, malc wrote:
> On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:
> 
> > On Mon, Jan 04, 2010 at 05:54:13PM -0600, Anthony Liguori wrote:
> > > On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
> > >> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
> > >>    
> > >>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
> > >>>      
> > >>>> Make the timer subsystem register its own bottom half instead of
> > >>>> placing the bottom half code in the heart of the main loop.  To
> > >>>> test if an alarm timer is pending, just check if the bottom half is
> > >>>> scheduled.
> > >>>>
> > >>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> > >>>>        
> > >>> I'm not a huge fan of this for a couple reasons.  The first is that it
> > >>> introduces a subtle semantic change.  Previously, timers always ran
> > >>> before bottom halves whereas after this change, timers may run after
> > >>> some bottoms halves but before others.  While this should be okay in
> > >>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
> > >>> very surprised if cris wasn't affected by this.
> > >>>
> > >>> But more importantly, I think timer dispatch needs to be part of the
> > >>> select loop.  malc has a git tree that replaces host alarm timers with
> > >>> select() timeouts.
> > >>>      
> > >> Where is that tree?
> > >>    
> > >
> > > http://repo.or.cz/w/qemu/malc.git  mtloop
> > 
> > Don't seem to see anything there.
> > malc?
> 
> Yes?

Do you have a patch to switch from signals to select?  If yes could you
tell me where it is so  I can test whether it fixes winxp install
crashes I see?


> > >> IMO we need that, I am not sure all code is as signal-safe
> > >> as it should be. At least crashes that I saw with winxp install
> > >> seem to be related to signal handling.
> > >>    
> > >
> > > Regards,
> > >
> > > Anthony Liguori
> > 
> 
> -- 
> mailto:av1474@comtv.ru
malc - Jan. 5, 2010, 3:32 p.m.
On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:

> On Tue, Jan 05, 2010 at 06:23:34PM +0300, malc wrote:
> > On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:
> > 
> > > On Mon, Jan 04, 2010 at 05:54:13PM -0600, Anthony Liguori wrote:
> > > > On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
> > > >> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
> > > >>    
> > > >>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
> > > >>>      
> > > >>>> Make the timer subsystem register its own bottom half instead of
> > > >>>> placing the bottom half code in the heart of the main loop.  To
> > > >>>> test if an alarm timer is pending, just check if the bottom half is
> > > >>>> scheduled.
> > > >>>>
> > > >>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> > > >>>>        
> > > >>> I'm not a huge fan of this for a couple reasons.  The first is that it
> > > >>> introduces a subtle semantic change.  Previously, timers always ran
> > > >>> before bottom halves whereas after this change, timers may run after
> > > >>> some bottoms halves but before others.  While this should be okay in
> > > >>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
> > > >>> very surprised if cris wasn't affected by this.
> > > >>>
> > > >>> But more importantly, I think timer dispatch needs to be part of the
> > > >>> select loop.  malc has a git tree that replaces host alarm timers with
> > > >>> select() timeouts.
> > > >>>      
> > > >> Where is that tree?
> > > >>    
> > > >
> > > > http://repo.or.cz/w/qemu/malc.git  mtloop
> > > 
> > > Don't seem to see anything there.
> > > malc?
> > 
> > Yes?
> 
> Do you have a patch to switch from signals to select?  If yes could you
> tell me where it is so  I can test whether it fixes winxp install
> crashes I see?

All i have is published at:
http://repo.or.cz/w/qemu/malc.git/shortlog/refs/heads/mtloop

Specifically:
http://repo.or.cz/w/qemu/malc.git/commit/b99fef4dc2f1ce9d436791b6821cb30e6335ee9b

A small fix is needed to make it run with KVM though.

> 
> > > >> IMO we need that, I am not sure all code is as signal-safe
> > > >> as it should be. At least crashes that I saw with winxp install
> > > >> seem to be related to signal handling.
> > > >>    
> > > >
> > > > Regards,
> > > >
> > > > Anthony Liguori
> > > 
> > 
> > -- 
> > mailto:av1474@comtv.ru
>
Michael S. Tsirkin - Jan. 5, 2010, 3:33 p.m.
On Tue, Jan 05, 2010 at 06:32:09PM +0300, malc wrote:
> On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:
> 
> > On Tue, Jan 05, 2010 at 06:23:34PM +0300, malc wrote:
> > > On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:
> > > 
> > > > On Mon, Jan 04, 2010 at 05:54:13PM -0600, Anthony Liguori wrote:
> > > > > On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
> > > > >> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
> > > > >>    
> > > > >>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
> > > > >>>      
> > > > >>>> Make the timer subsystem register its own bottom half instead of
> > > > >>>> placing the bottom half code in the heart of the main loop.  To
> > > > >>>> test if an alarm timer is pending, just check if the bottom half is
> > > > >>>> scheduled.
> > > > >>>>
> > > > >>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> > > > >>>>        
> > > > >>> I'm not a huge fan of this for a couple reasons.  The first is that it
> > > > >>> introduces a subtle semantic change.  Previously, timers always ran
> > > > >>> before bottom halves whereas after this change, timers may run after
> > > > >>> some bottoms halves but before others.  While this should be okay in
> > > > >>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
> > > > >>> very surprised if cris wasn't affected by this.
> > > > >>>
> > > > >>> But more importantly, I think timer dispatch needs to be part of the
> > > > >>> select loop.  malc has a git tree that replaces host alarm timers with
> > > > >>> select() timeouts.
> > > > >>>      
> > > > >> Where is that tree?
> > > > >>    
> > > > >
> > > > > http://repo.or.cz/w/qemu/malc.git  mtloop
> > > > 
> > > > Don't seem to see anything there.
> > > > malc?
> > > 
> > > Yes?
> > 
> > Do you have a patch to switch from signals to select?  If yes could you
> > tell me where it is so  I can test whether it fixes winxp install
> > crashes I see?
> 
> All i have is published at:
> http://repo.or.cz/w/qemu/malc.git/shortlog/refs/heads/mtloop
> 
> Specifically:
> http://repo.or.cz/w/qemu/malc.git/commit/b99fef4dc2f1ce9d436791b6821cb30e6335ee9b
> 
> A small fix is needed to make it run with KVM though.

Hmm, I guess at least iothread needs to be put back for KVM?
malc - Jan. 5, 2010, 3:39 p.m.
On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:

> On Tue, Jan 05, 2010 at 06:32:09PM +0300, malc wrote:
> > On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:
> > 
> > > On Tue, Jan 05, 2010 at 06:23:34PM +0300, malc wrote:
> > > > On Tue, 5 Jan 2010, Michael S. Tsirkin wrote:
> > > > 
> > > > > On Mon, Jan 04, 2010 at 05:54:13PM -0600, Anthony Liguori wrote:
> > > > > > On 01/04/2010 02:01 PM, Michael S. Tsirkin wrote:
> > > > > >> On Mon, Jan 04, 2010 at 02:24:53PM -0600, Anthony Liguori wrote:
> > > > > >>    
> > > > > >>> On 12/21/2009 02:09 AM, Paolo Bonzini wrote:
> > > > > >>>      
> > > > > >>>> Make the timer subsystem register its own bottom half instead of
> > > > > >>>> placing the bottom half code in the heart of the main loop.  To
> > > > > >>>> test if an alarm timer is pending, just check if the bottom half is
> > > > > >>>> scheduled.
> > > > > >>>>
> > > > > >>>> Signed-off-by: Paolo Bonzini<pbonzini@redhat.com>
> > > > > >>>>        
> > > > > >>> I'm not a huge fan of this for a couple reasons.  The first is that it
> > > > > >>> introduces a subtle semantic change.  Previously, timers always ran
> > > > > >>> before bottom halves whereas after this change, timers may run after
> > > > > >>> some bottoms halves but before others.  While this should be okay in
> > > > > >>> principle, in practice, I'm sure it'll introduce regressions.  I'd be
> > > > > >>> very surprised if cris wasn't affected by this.
> > > > > >>>
> > > > > >>> But more importantly, I think timer dispatch needs to be part of the
> > > > > >>> select loop.  malc has a git tree that replaces host alarm timers with
> > > > > >>> select() timeouts.
> > > > > >>>      
> > > > > >> Where is that tree?
> > > > > >>    
> > > > > >
> > > > > > http://repo.or.cz/w/qemu/malc.git  mtloop
> > > > > 
> > > > > Don't seem to see anything there.
> > > > > malc?
> > > > 
> > > > Yes?
> > > 
> > > Do you have a patch to switch from signals to select?  If yes could you
> > > tell me where it is so  I can test whether it fixes winxp install
> > > crashes I see?
> > 
> > All i have is published at:
> > http://repo.or.cz/w/qemu/malc.git/shortlog/refs/heads/mtloop
> > 
> > Specifically:
> > http://repo.or.cz/w/qemu/malc.git/commit/b99fef4dc2f1ce9d436791b6821cb30e6335ee9b
> > 
> > A small fix is needed to make it run with KVM though.
> 
> Hmm, I guess at least iothread needs to be put back for KVM?

Nope. One of the sem_waits can actually be interrupted and shouldn't
cause an untimely death of the process.
Jamie Lokier - Jan. 6, 2010, 1:20 a.m.
Anthony Liguori wrote:
> Thread and signal safety are slightly different.

They are very different:

Virtually all libc calls are thread safe, unless they use unsafe
static data APIs.

On the other hand, the number of libc calls that are signal safe is
very limited.

For example, calling printf() is not signal-safe; neither is malloc().

pthread functions are not safe in signal handlers either:
pthread_self, pthread_getspecific, pthread_mutex_lock and
pthread_cond_broadcast, not of them are signal-safe.

In a nutshell, it's dubious to do much inside a signal handler.

Pure system calls tend to be ok, though.

> Scheduling an idle 
> callback from a signal handler is a pretty reasonable thing to do.

Scheduling, yes, by telling the main event loop that it's time.

Running it inside the signal handler... Then you're depending on
non-portabilities among hosts.

-- Jamie

Patch

diff --git a/vl.c b/vl.c
index 78807f5..289aadc 100644
--- a/vl.c
+++ b/vl.c
@@ -573,10 +573,17 @@  struct qemu_alarm_timer {
     void (*rearm)(struct qemu_alarm_timer *t);
     void *priv;
 
+    QEMUBH *bh;
     char expired;
-    char pending;
 };
 
+static struct qemu_alarm_timer *alarm_timer;
+
+static inline int qemu_alarm_pending(void)
+{
+    return qemu_bh_scheduled(alarm_timer->bh);
+}
+
 static inline int alarm_has_dynticks(struct qemu_alarm_timer *t)
 {
     return !!t->rearm;
@@ -593,8 +600,6 @@  static void qemu_rearm_alarm_timer(struct qemu_alarm_timer *t)
 /* TODO: MIN_TIMER_REARM_US should be optimized */
 #define MIN_TIMER_REARM_US 250
 
-static struct qemu_alarm_timer *alarm_timer;
-
 #ifdef _WIN32
 
 struct qemu_alarm_win32 {
@@ -874,7 +879,7 @@  void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
 
     /* Rearm if necessary  */
     if (pt == &active_timers[ts->clock->type]) {
-        if (!alarm_timer->pending) {
+        if (!qemu_alarm_pending()) {
             qemu_rearm_alarm_timer(alarm_timer);
         }
         /* Interrupt execution to force deadline recalculation.  */
@@ -1001,6 +1006,31 @@  static const VMStateDescription vmstate_timers = {
 
 static void qemu_event_increment(void);
 
+static void qemu_timer_bh(void *opaque)
+{
+    struct qemu_alarm_timer *t = opaque;
+
+    /* rearm timer, if not periodic */
+    if (t->expired) {
+        t->expired = 0;
+        qemu_rearm_alarm_timer(t);
+    }
+
+    /* vm time timers */
+    if (vm_running) {
+        if (!cur_cpu || likely(!(cur_cpu->singlestep_enabled & SSTEP_NOTIMER)))
+            qemu_run_timers(&active_timers[QEMU_CLOCK_VIRTUAL],
+                            qemu_get_clock(vm_clock));
+    }
+
+    /* real time timers */
+    qemu_run_timers(&active_timers[QEMU_CLOCK_REALTIME],
+                    qemu_get_clock(rt_clock));
+
+    qemu_run_timers(&active_timers[QEMU_CLOCK_HOST],
+                    qemu_get_clock(host_clock));
+}
+
 #ifdef _WIN32
 static void CALLBACK host_alarm_handler(UINT uTimerID, UINT uMsg,
                                         DWORD_PTR dwUser, DWORD_PTR dw1,
@@ -1059,8 +1089,7 @@  static void host_alarm_handler(int host_signum)
             cpu_exit(next_cpu);
         }
 #endif
-        t->pending = 1;
-        qemu_notify_event();
+        qemu_bh_schedule(t->bh);
     }
 }
 
@@ -1446,7 +1475,8 @@  static int init_timer_alarm(void)
     }
 
     /* first event is at time 0 */
-    t->pending = 1;
+    t->bh = qemu_bh_new(qemu_timer_bh, t);
+    qemu_bh_schedule(t->bh);
     alarm_timer = t;
     qemu_add_vm_change_state_handler(alarm_timer_on_change_state_rearm, t);
 
@@ -3811,28 +3841,6 @@  void main_loop_wait(int timeout)
 
     slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
 
-    /* rearm timer, if not periodic */
-    if (alarm_timer->expired) {
-        alarm_timer->expired = 0;
-        qemu_rearm_alarm_timer(alarm_timer);
-    }
-
-    alarm_timer->pending = 0;
-
-    /* vm time timers */
-    if (vm_running) {
-        if (!cur_cpu || likely(!(cur_cpu->singlestep_enabled & SSTEP_NOTIMER)))
-            qemu_run_timers(&active_timers[QEMU_CLOCK_VIRTUAL],
-                            qemu_get_clock(vm_clock));
-    }
-
-    /* real time timers */
-    qemu_run_timers(&active_timers[QEMU_CLOCK_REALTIME],
-                    qemu_get_clock(rt_clock));
-
-    qemu_run_timers(&active_timers[QEMU_CLOCK_HOST],
-                    qemu_get_clock(host_clock));
-
     /* Check bottom-halves last in case any of the earlier events triggered
        them.  */
     qemu_bh_poll();
@@ -3888,7 +3896,7 @@  static void tcg_cpu_exec(void)
 
         if (!vm_running)
             break;
-        if (alarm_timer->pending)
+        if (qemu_alarm_pending())
             break;
         if (cpu_can_run(env))
             ret = qemu_cpu_exec(env);