Patchwork [v3,5/5] hpet 'driftfix': add code in hpet_timer() to compensate delayed callbacks and coalesced interrupts

login
register
mail settings
Submitter Ulrich Obergfell
Date April 28, 2011, 2:25 p.m.
Message ID <1304000700-3640-6-git-send-email-uobergfe@redhat.com>
Download mbox | patch
Permalink /patch/93227/
State New
Headers show

Comments

Ulrich Obergfell - April 28, 2011, 2:25 p.m.
Loss of periodic timer interrupts caused by delayed callbacks and by
interrupt coalescing is compensated by gradually injecting additional
interrupts during subsequent timer intervals, starting at a rate of
one additional interrupt per interval. The injection of additional
interrupts is based on a backlog of unaccounted HPET clock periods
(new HPETTimer field 'ticks_not_accounted'). The backlog increases
due to delayed callbacks and coalesced interrupts, and it decreases
if an interrupt was injected successfully. If the backlog increases
while compensation is still in progress, the rate at which additional
interrupts are injected is increased too. A limit is imposed on the
backlog and on the rate.

Injecting additional timer interrupts to compensate lost interrupts
can alleviate long term time drift. However, on a short time scale,
this method can have the side effect of making virtual machine time
intermittently pass slower and faster than real time (depending on
the guest's time keeping algorithm). Compensation is disabled by
default and can be enabled for guests where this behaviour may be
acceptable.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
---
 hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 61 insertions(+), 2 deletions(-)
Marcelo Tosatti - May 3, 2011, 7:03 p.m.
On Thu, Apr 28, 2011 at 04:25:00PM +0200, Ulrich Obergfell wrote:
> Loss of periodic timer interrupts caused by delayed callbacks and by
> interrupt coalescing is compensated by gradually injecting additional
> interrupts during subsequent timer intervals, starting at a rate of
> one additional interrupt per interval. The injection of additional
> interrupts is based on a backlog of unaccounted HPET clock periods
> (new HPETTimer field 'ticks_not_accounted'). The backlog increases
> due to delayed callbacks and coalesced interrupts, and it decreases
> if an interrupt was injected successfully. If the backlog increases
> while compensation is still in progress, the rate at which additional
> interrupts are injected is increased too. A limit is imposed on the
> backlog and on the rate.
> 
> Injecting additional timer interrupts to compensate lost interrupts
> can alleviate long term time drift. However, on a short time scale,
> this method can have the side effect of making virtual machine time
> intermittently pass slower and faster than real time (depending on
> the guest's time keeping algorithm). Compensation is disabled by
> default and can be enabled for guests where this behaviour may be
> acceptable.
> 
> Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
> ---
>  hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 61 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/hpet.c b/hw/hpet.c
> index 35466ae..92d5f58 100644
> --- a/hw/hpet.c
> +++ b/hw/hpet.c
> @@ -32,6 +32,7 @@
>  #include "sysbus.h"
>  #include "mc146818rtc.h"
>  #include "sysemu.h"
> +#include <assert.h>
>  
>  //#define HPET_DEBUG
>  #ifdef HPET_DEBUG
> @@ -42,6 +43,9 @@
>  
>  #define HPET_MSI_SUPPORT        0
>  
> +#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
> +#define MAX_IRQ_RATE                (uint32_t)10
> +
>  struct HPETState;
>  typedef struct HPETTimer {  /* timers */
>      uint8_t tn;             /*timer number*/
> @@ -326,28 +330,63 @@ static const VMStateDescription vmstate_hpet = {
>      }
>  };
>  
> +static bool hpet_timer_has_tick_backlog(HPETTimer *t)
> +{
> +    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
> +    return (backlog >= t->period);
> +}
> +
>  /*
>   * timer expiration callback
>   */
>  static void hpet_timer(void *opaque)
>  {
>      HPETTimer *t = opaque;
> +    HPETState *s = t->state;
>      uint64_t diff;
> -
> +    int irq_delivered = 0;
> +    uint32_t irq_count = 0;
>      uint64_t period = t->period;
>      uint64_t cur_tick = hpet_get_ticks(t->state);
>  
> +    if (s->driftfix && !t->ticks_not_accounted) {
> +        t->ticks_not_accounted = t->prev_period = t->period;
> +    }
>      if (timer_is_periodic(t) && period != 0) {
>          if (t->config & HPET_TN_32BIT) {
>              while (hpet_time_after(cur_tick, t->cmp)) {
>                  t->cmp = (uint32_t)(t->cmp + t->period);
> +                t->ticks_not_accounted += t->period;
> +                irq_count++;
>              }
>          } else {
>              while (hpet_time_after64(cur_tick, t->cmp)) {
>                  t->cmp += period;
> +                t->ticks_not_accounted += period;
> +                irq_count++;
>              }
>          }
>          diff = hpet_calculate_diff(t, cur_tick);
> +        if (s->driftfix) {
> +            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
> +                t->ticks_not_accounted = t->period + t->prev_period;
> +            }
> +            if (hpet_timer_has_tick_backlog(t)) {
> +                if (t->irq_rate == 1 || irq_count > 1) {
> +                    t->irq_rate++;
> +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> +                }
> +                if (t->divisor == 0) {
> +                    assert(irq_count);
> +                }
> +                if (irq_count) {
> +                    t->divisor = t->irq_rate;
> +                }
> +                diff /= t->divisor--;
> +            } else {
> +                t->irq_rate = 1;
> +            }
> +        }
>          qemu_mod_timer(t->qemu_timer,
>                         qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
>      } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
> @@ -358,7 +397,22 @@ static void hpet_timer(void *opaque)
>              t->wrap_flag = 0;
>          }
>      }
> -    update_irq(t, 1);
> +    if (s->driftfix && timer_is_periodic(t) && period != 0) {
> +        if (t->ticks_not_accounted >= t->period + t->prev_period) {
> +            irq_delivered = update_irq(t, 1);
> +            if (irq_delivered) {
> +                t->ticks_not_accounted -= t->prev_period;
> +                t->prev_period = t->period;
> +            } else {
> +                if (irq_count) {
> +                    t->irq_rate++;
> +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> +                }
> +            }
> +        }
> +    } else {
> +        update_irq(t, 1);
> +    }
>  }

Hi Ulrich,

Whats prev_period for, since in practice the period will not change
between interrupts (OS programs comparator once, or perhaps twice during
bootup) ?

Other than that, shouldnt reset accounting variables to init state on
write to GLOBAL_ENABLE_CFG / writes to main counter?
Glauber Costa - May 3, 2011, 10:08 p.m.
On Tue, 2011-05-03 at 16:03 -0300, Marcelo Tosatti wrote:
> On Thu, Apr 28, 2011 at 04:25:00PM +0200, Ulrich Obergfell wrote:
> > Loss of periodic timer interrupts caused by delayed callbacks and by
> > interrupt coalescing is compensated by gradually injecting additional
> > interrupts during subsequent timer intervals, starting at a rate of
> > one additional interrupt per interval. The injection of additional
> > interrupts is based on a backlog of unaccounted HPET clock periods
> > (new HPETTimer field 'ticks_not_accounted'). The backlog increases
> > due to delayed callbacks and coalesced interrupts, and it decreases
> > if an interrupt was injected successfully. If the backlog increases
> > while compensation is still in progress, the rate at which additional
> > interrupts are injected is increased too. A limit is imposed on the
> > backlog and on the rate.
> > 
> > Injecting additional timer interrupts to compensate lost interrupts
> > can alleviate long term time drift. However, on a short time scale,
> > this method can have the side effect of making virtual machine time
> > intermittently pass slower and faster than real time (depending on
> > the guest's time keeping algorithm). Compensation is disabled by
> > default and can be enabled for guests where this behaviour may be
> > acceptable.
> > 
> > Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
> > ---
> >  hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 files changed, 61 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/hpet.c b/hw/hpet.c
> > index 35466ae..92d5f58 100644
> > --- a/hw/hpet.c
> > +++ b/hw/hpet.c
> > @@ -32,6 +32,7 @@
> >  #include "sysbus.h"
> >  #include "mc146818rtc.h"
> >  #include "sysemu.h"
> > +#include <assert.h>
> >  
> >  //#define HPET_DEBUG
> >  #ifdef HPET_DEBUG
> > @@ -42,6 +43,9 @@
> >  
> >  #define HPET_MSI_SUPPORT        0
> >  
> > +#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
> > +#define MAX_IRQ_RATE                (uint32_t)10
> > +
> >  struct HPETState;
> >  typedef struct HPETTimer {  /* timers */
> >      uint8_t tn;             /*timer number*/
> > @@ -326,28 +330,63 @@ static const VMStateDescription vmstate_hpet = {
> >      }
> >  };
> >  
> > +static bool hpet_timer_has_tick_backlog(HPETTimer *t)
> > +{
> > +    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
> > +    return (backlog >= t->period);
> > +}
> > +
> >  /*
> >   * timer expiration callback
> >   */
> >  static void hpet_timer(void *opaque)
> >  {
> >      HPETTimer *t = opaque;
> > +    HPETState *s = t->state;
> >      uint64_t diff;
> > -
> > +    int irq_delivered = 0;
> > +    uint32_t irq_count = 0;
> >      uint64_t period = t->period;
> >      uint64_t cur_tick = hpet_get_ticks(t->state);
> >  
> > +    if (s->driftfix && !t->ticks_not_accounted) {
> > +        t->ticks_not_accounted = t->prev_period = t->period;
> > +    }
> >      if (timer_is_periodic(t) && period != 0) {
> >          if (t->config & HPET_TN_32BIT) {
> >              while (hpet_time_after(cur_tick, t->cmp)) {
> >                  t->cmp = (uint32_t)(t->cmp + t->period);
> > +                t->ticks_not_accounted += t->period;
> > +                irq_count++;
> >              }
> >          } else {
> >              while (hpet_time_after64(cur_tick, t->cmp)) {
> >                  t->cmp += period;
> > +                t->ticks_not_accounted += period;
> > +                irq_count++;
> >              }
> >          }
> >          diff = hpet_calculate_diff(t, cur_tick);
> > +        if (s->driftfix) {
> > +            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
> > +                t->ticks_not_accounted = t->period + t->prev_period;
> > +            }
> > +            if (hpet_timer_has_tick_backlog(t)) {
> > +                if (t->irq_rate == 1 || irq_count > 1) {
> > +                    t->irq_rate++;
> > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > +                }
> > +                if (t->divisor == 0) {
> > +                    assert(irq_count);
> > +                }
> > +                if (irq_count) {
> > +                    t->divisor = t->irq_rate;
> > +                }
> > +                diff /= t->divisor--;
> > +            } else {
> > +                t->irq_rate = 1;
> > +            }
> > +        }
> >          qemu_mod_timer(t->qemu_timer,
> >                         qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
> >      } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
> > @@ -358,7 +397,22 @@ static void hpet_timer(void *opaque)
> >              t->wrap_flag = 0;
> >          }
> >      }
> > -    update_irq(t, 1);
> > +    if (s->driftfix && timer_is_periodic(t) && period != 0) {
> > +        if (t->ticks_not_accounted >= t->period + t->prev_period) {
> > +            irq_delivered = update_irq(t, 1);
> > +            if (irq_delivered) {
> > +                t->ticks_not_accounted -= t->prev_period;
> > +                t->prev_period = t->period;
> > +            } else {
> > +                if (irq_count) {
> > +                    t->irq_rate++;
> > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > +                }
> > +            }
> > +        }
> > +    } else {
> > +        update_irq(t, 1);
> > +    }
> >  }
> 
> Hi Ulrich,
> 
> Whats prev_period for, since in practice the period will not change
> between interrupts (OS programs comparator once, or perhaps twice during
> bootup) ?

Actually, some systems, like Windows 2000-and-something change this
period quite heavily under multimedia workloads.

> Other than that, shouldnt reset accounting variables to init state on
> write to GLOBAL_ENABLE_CFG / writes to main counter?
>
Marcelo Tosatti - May 3, 2011, 10:55 p.m.
On Tue, May 03, 2011 at 07:08:25PM -0300, Glauber Costa wrote:
> On Tue, 2011-05-03 at 16:03 -0300, Marcelo Tosatti wrote:
> > On Thu, Apr 28, 2011 at 04:25:00PM +0200, Ulrich Obergfell wrote:
> > > Loss of periodic timer interrupts caused by delayed callbacks and by
> > > interrupt coalescing is compensated by gradually injecting additional
> > > interrupts during subsequent timer intervals, starting at a rate of
> > > one additional interrupt per interval. The injection of additional
> > > interrupts is based on a backlog of unaccounted HPET clock periods
> > > (new HPETTimer field 'ticks_not_accounted'). The backlog increases
> > > due to delayed callbacks and coalesced interrupts, and it decreases
> > > if an interrupt was injected successfully. If the backlog increases
> > > while compensation is still in progress, the rate at which additional
> > > interrupts are injected is increased too. A limit is imposed on the
> > > backlog and on the rate.
> > > 
> > > Injecting additional timer interrupts to compensate lost interrupts
> > > can alleviate long term time drift. However, on a short time scale,
> > > this method can have the side effect of making virtual machine time
> > > intermittently pass slower and faster than real time (depending on
> > > the guest's time keeping algorithm). Compensation is disabled by
> > > default and can be enabled for guests where this behaviour may be
> > > acceptable.
> > > 
> > > Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
> > > ---
> > >  hw/hpet.c |   63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > >  1 files changed, 61 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/hpet.c b/hw/hpet.c
> > > index 35466ae..92d5f58 100644
> > > --- a/hw/hpet.c
> > > +++ b/hw/hpet.c
> > > @@ -32,6 +32,7 @@
> > >  #include "sysbus.h"
> > >  #include "mc146818rtc.h"
> > >  #include "sysemu.h"
> > > +#include <assert.h>
> > >  
> > >  //#define HPET_DEBUG
> > >  #ifdef HPET_DEBUG
> > > @@ -42,6 +43,9 @@
> > >  
> > >  #define HPET_MSI_SUPPORT        0
> > >  
> > > +#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
> > > +#define MAX_IRQ_RATE                (uint32_t)10
> > > +
> > >  struct HPETState;
> > >  typedef struct HPETTimer {  /* timers */
> > >      uint8_t tn;             /*timer number*/
> > > @@ -326,28 +330,63 @@ static const VMStateDescription vmstate_hpet = {
> > >      }
> > >  };
> > >  
> > > +static bool hpet_timer_has_tick_backlog(HPETTimer *t)
> > > +{
> > > +    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
> > > +    return (backlog >= t->period);
> > > +}
> > > +
> > >  /*
> > >   * timer expiration callback
> > >   */
> > >  static void hpet_timer(void *opaque)
> > >  {
> > >      HPETTimer *t = opaque;
> > > +    HPETState *s = t->state;
> > >      uint64_t diff;
> > > -
> > > +    int irq_delivered = 0;
> > > +    uint32_t irq_count = 0;
> > >      uint64_t period = t->period;
> > >      uint64_t cur_tick = hpet_get_ticks(t->state);
> > >  
> > > +    if (s->driftfix && !t->ticks_not_accounted) {
> > > +        t->ticks_not_accounted = t->prev_period = t->period;
> > > +    }
> > >      if (timer_is_periodic(t) && period != 0) {
> > >          if (t->config & HPET_TN_32BIT) {
> > >              while (hpet_time_after(cur_tick, t->cmp)) {
> > >                  t->cmp = (uint32_t)(t->cmp + t->period);
> > > +                t->ticks_not_accounted += t->period;
> > > +                irq_count++;
> > >              }
> > >          } else {
> > >              while (hpet_time_after64(cur_tick, t->cmp)) {
> > >                  t->cmp += period;
> > > +                t->ticks_not_accounted += period;
> > > +                irq_count++;
> > >              }
> > >          }
> > >          diff = hpet_calculate_diff(t, cur_tick);
> > > +        if (s->driftfix) {
> > > +            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
> > > +                t->ticks_not_accounted = t->period + t->prev_period;
> > > +            }
> > > +            if (hpet_timer_has_tick_backlog(t)) {
> > > +                if (t->irq_rate == 1 || irq_count > 1) {
> > > +                    t->irq_rate++;
> > > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > > +                }
> > > +                if (t->divisor == 0) {
> > > +                    assert(irq_count);
> > > +                }
> > > +                if (irq_count) {
> > > +                    t->divisor = t->irq_rate;
> > > +                }
> > > +                diff /= t->divisor--;
> > > +            } else {
> > > +                t->irq_rate = 1;
> > > +            }
> > > +        }
> > >          qemu_mod_timer(t->qemu_timer,
> > >                         qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
> > >      } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
> > > @@ -358,7 +397,22 @@ static void hpet_timer(void *opaque)
> > >              t->wrap_flag = 0;
> > >          }
> > >      }
> > > -    update_irq(t, 1);
> > > +    if (s->driftfix && timer_is_periodic(t) && period != 0) {
> > > +        if (t->ticks_not_accounted >= t->period + t->prev_period) {
				             ^^^^^^^^^^

> > > +            irq_delivered = update_irq(t, 1);
> > > +            if (irq_delivered) {
> > > +                t->ticks_not_accounted -= t->prev_period;
> > > +                t->prev_period = t->period;
> > > +            } else {
> > > +                if (irq_count) {
> > > +                    t->irq_rate++;
> > > +                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
> > > +                }
> > > +            }
> > > +        }
> > > +    } else {
> > > +        update_irq(t, 1);
> > > +    }
> > >  }
> > 
> > Hi Ulrich,
> > 
> > Whats prev_period for, since in practice the period will not change
> > between interrupts (OS programs comparator once, or perhaps twice during
> > bootup) ?
> 
> Actually, some systems, like Windows 2000-and-something change this
> period quite heavily under multimedia workloads.

I see. Still, using the period at the previous timer handler instance in
the decision whether to inject an interrupt to the guest makes no sense
to me.

> > Other than that, shouldnt reset accounting variables to init state on
> > write to GLOBAL_ENABLE_CFG / writes to main counter?

What i meant to ask here is whether the reinjection state (including
ticks_not_accounted) should be reset whenever a different period is
programmed.
Ulrich Obergfell - May 4, 2011, 8:06 a.m.
Hi Marcelo,
 
> Whats prev_period for, since in practice the period will not change
> between interrupts (OS programs comparator once, or perhaps twice
> during bootup) ?

'prev_period' is needed if a guest o/s changes the comparator period
'on the fly' (without stopping and restarting the timer).


             guest o/s changes period
               |
  ti(n-1)      |        ti(n)                          ti(n+1)
    |          v          |                              |
    +---------------------+------------------------------+

     <--- prev_period ---> <---------- period ---------->


The idea is that each timer interrupt represents a certain quantum
of time (the comparator period). If a guest o/s changes the period
between timer interrupt 'n-1' and timer interrupt 'n', I think the
new value should not take effect before timer interrupt 'n'. Timer
interrupt 'n' still represents the old/previous quantum, and timer
interrupt 'n+1' represents the new quantum.

Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
and sets 'prev_period' to 'period' when an interrupt was delivered
to the guest o/s.

+            irq_delivered = update_irq(t, 1);
+            if (irq_delivered) {
+                t->ticks_not_accounted -= t->prev_period;
+                t->prev_period = t->period;
+            } else {

Most of the time 'prev_period' is equal to 'period'. It should only
be different in the scenario shown above.


Regards,

Uli
Marcelo Tosatti - May 4, 2011, 9:09 a.m.
On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> 
> Hi Marcelo,
>  
> > Whats prev_period for, since in practice the period will not change
> > between interrupts (OS programs comparator once, or perhaps twice
> > during bootup) ?
> 
> 'prev_period' is needed if a guest o/s changes the comparator period
> 'on the fly' (without stopping and restarting the timer).
> 
> 
>              guest o/s changes period
>                |
>   ti(n-1)      |        ti(n)                          ti(n+1)
>     |          v          |                              |
>     +---------------------+------------------------------+
> 
>      <--- prev_period ---> <---------- period ---------->
> 
> 
> The idea is that each timer interrupt represents a certain quantum
> of time (the comparator period). If a guest o/s changes the period
> between timer interrupt 'n-1' and timer interrupt 'n', I think the
> new value should not take effect before timer interrupt 'n'. Timer
> interrupt 'n' still represents the old/previous quantum, and timer
> interrupt 'n+1' represents the new quantum.
> 
> Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> and sets 'prev_period' to 'period' when an interrupt was delivered
> to the guest o/s.
> 
> +            irq_delivered = update_irq(t, 1);
> +            if (irq_delivered) {
> +                t->ticks_not_accounted -= t->prev_period;
> +                t->prev_period = t->period;
> +            } else {
> 
> Most of the time 'prev_period' is equal to 'period'. It should only
> be different in the scenario shown above.

OK, makes sense. You should probably reset ticks_not_accounted to zero
on HPET initialization (for example, to avoid miscalibration when
kexec'ing a new kernel).
Glauber Costa - May 4, 2011, 1:36 p.m.
On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > 
> > Hi Marcelo,
> >  
> > > Whats prev_period for, since in practice the period will not change
> > > between interrupts (OS programs comparator once, or perhaps twice
> > > during bootup) ?
> > 
> > 'prev_period' is needed if a guest o/s changes the comparator period
> > 'on the fly' (without stopping and restarting the timer).
> > 
> > 
> >              guest o/s changes period
> >                |
> >   ti(n-1)      |        ti(n)                          ti(n+1)
> >     |          v          |                              |
> >     +---------------------+------------------------------+
> > 
> >      <--- prev_period ---> <---------- period ---------->
> > 
> > 
> > The idea is that each timer interrupt represents a certain quantum
> > of time (the comparator period). If a guest o/s changes the period
> > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > new value should not take effect before timer interrupt 'n'. Timer
> > interrupt 'n' still represents the old/previous quantum, and timer
> > interrupt 'n+1' represents the new quantum.
> > 
> > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > and sets 'prev_period' to 'period' when an interrupt was delivered
> > to the guest o/s.
> > 
> > +            irq_delivered = update_irq(t, 1);
> > +            if (irq_delivered) {
> > +                t->ticks_not_accounted -= t->prev_period;
> > +                t->prev_period = t->period;
> > +            } else {
> > 
> > Most of the time 'prev_period' is equal to 'period'. It should only
> > be different in the scenario shown above.
> 
> OK, makes sense. You should probably reset ticks_not_accounted to zero
> on HPET initialization (for example, to avoid miscalibration when
> kexec'ing a new kernel).

Everybody resetting the machine in anyway is expected to force devices
to be reinitialized, right ?
I may be wrong, but I was under the impression that kexec would do this
as well. In this case, the reset function should be enough.
Gleb Natapov - May 4, 2011, 1:46 p.m.
On Wed, May 04, 2011 at 10:36:12AM -0300, Glauber Costa wrote:
> On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> > On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > > 
> > > Hi Marcelo,
> > >  
> > > > Whats prev_period for, since in practice the period will not change
> > > > between interrupts (OS programs comparator once, or perhaps twice
> > > > during bootup) ?
> > > 
> > > 'prev_period' is needed if a guest o/s changes the comparator period
> > > 'on the fly' (without stopping and restarting the timer).
> > > 
> > > 
> > >              guest o/s changes period
> > >                |
> > >   ti(n-1)      |        ti(n)                          ti(n+1)
> > >     |          v          |                              |
> > >     +---------------------+------------------------------+
> > > 
> > >      <--- prev_period ---> <---------- period ---------->
> > > 
> > > 
> > > The idea is that each timer interrupt represents a certain quantum
> > > of time (the comparator period). If a guest o/s changes the period
> > > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > > new value should not take effect before timer interrupt 'n'. Timer
> > > interrupt 'n' still represents the old/previous quantum, and timer
> > > interrupt 'n+1' represents the new quantum.
> > > 
> > > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > > and sets 'prev_period' to 'period' when an interrupt was delivered
> > > to the guest o/s.
> > > 
> > > +            irq_delivered = update_irq(t, 1);
> > > +            if (irq_delivered) {
> > > +                t->ticks_not_accounted -= t->prev_period;
> > > +                t->prev_period = t->period;
> > > +            } else {
> > > 
> > > Most of the time 'prev_period' is equal to 'period'. It should only
> > > be different in the scenario shown above.
> > 
> > OK, makes sense. You should probably reset ticks_not_accounted to zero
> > on HPET initialization (for example, to avoid miscalibration when
> > kexec'ing a new kernel).
> 
> Everybody resetting the machine in anyway is expected to force devices
> to be reinitialized, right ?
> I may be wrong, but I was under the impression that kexec would do this
> as well. In this case, the reset function should be enough.
> 
kexec does not reset a machine. That's the whole point of kexec in
fact.

--
			Gleb.
Glauber Costa - May 4, 2011, 1:47 p.m.
On Wed, 2011-05-04 at 16:46 +0300, Gleb Natapov wrote:
> On Wed, May 04, 2011 at 10:36:12AM -0300, Glauber Costa wrote:
> > On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> > > On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > > > 
> > > > Hi Marcelo,
> > > >  
> > > > > Whats prev_period for, since in practice the period will not change
> > > > > between interrupts (OS programs comparator once, or perhaps twice
> > > > > during bootup) ?
> > > > 
> > > > 'prev_period' is needed if a guest o/s changes the comparator period
> > > > 'on the fly' (without stopping and restarting the timer).
> > > > 
> > > > 
> > > >              guest o/s changes period
> > > >                |
> > > >   ti(n-1)      |        ti(n)                          ti(n+1)
> > > >     |          v          |                              |
> > > >     +---------------------+------------------------------+
> > > > 
> > > >      <--- prev_period ---> <---------- period ---------->
> > > > 
> > > > 
> > > > The idea is that each timer interrupt represents a certain quantum
> > > > of time (the comparator period). If a guest o/s changes the period
> > > > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > > > new value should not take effect before timer interrupt 'n'. Timer
> > > > interrupt 'n' still represents the old/previous quantum, and timer
> > > > interrupt 'n+1' represents the new quantum.
> > > > 
> > > > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > > > and sets 'prev_period' to 'period' when an interrupt was delivered
> > > > to the guest o/s.
> > > > 
> > > > +            irq_delivered = update_irq(t, 1);
> > > > +            if (irq_delivered) {
> > > > +                t->ticks_not_accounted -= t->prev_period;
> > > > +                t->prev_period = t->period;
> > > > +            } else {
> > > > 
> > > > Most of the time 'prev_period' is equal to 'period'. It should only
> > > > be different in the scenario shown above.
> > > 
> > > OK, makes sense. You should probably reset ticks_not_accounted to zero
> > > on HPET initialization (for example, to avoid miscalibration when
> > > kexec'ing a new kernel).
> > 
> > Everybody resetting the machine in anyway is expected to force devices
> > to be reinitialized, right ?
> > I may be wrong, but I was under the impression that kexec would do this
> > as well. In this case, the reset function should be enough.
> > 
> kexec does not reset a machine. That's the whole point of kexec in
> fact.
Sure thing, but doesn't it force the initialization routine of the devices themselves, without
going through the bios ?

> --
> 			Gleb.
Gleb Natapov - May 4, 2011, 1:55 p.m.
On Wed, May 04, 2011 at 10:47:40AM -0300, Glauber Costa wrote:
> On Wed, 2011-05-04 at 16:46 +0300, Gleb Natapov wrote:
> > On Wed, May 04, 2011 at 10:36:12AM -0300, Glauber Costa wrote:
> > > On Wed, 2011-05-04 at 06:09 -0300, Marcelo Tosatti wrote:
> > > > On Wed, May 04, 2011 at 04:06:59AM -0400, Ulrich Obergfell wrote:
> > > > > 
> > > > > Hi Marcelo,
> > > > >  
> > > > > > Whats prev_period for, since in practice the period will not change
> > > > > > between interrupts (OS programs comparator once, or perhaps twice
> > > > > > during bootup) ?
> > > > > 
> > > > > 'prev_period' is needed if a guest o/s changes the comparator period
> > > > > 'on the fly' (without stopping and restarting the timer).
> > > > > 
> > > > > 
> > > > >              guest o/s changes period
> > > > >                |
> > > > >   ti(n-1)      |        ti(n)                          ti(n+1)
> > > > >     |          v          |                              |
> > > > >     +---------------------+------------------------------+
> > > > > 
> > > > >      <--- prev_period ---> <---------- period ---------->
> > > > > 
> > > > > 
> > > > > The idea is that each timer interrupt represents a certain quantum
> > > > > of time (the comparator period). If a guest o/s changes the period
> > > > > between timer interrupt 'n-1' and timer interrupt 'n', I think the
> > > > > new value should not take effect before timer interrupt 'n'. Timer
> > > > > interrupt 'n' still represents the old/previous quantum, and timer
> > > > > interrupt 'n+1' represents the new quantum.
> > > > > 
> > > > > Hence, the patch decrements 'ticks_not_accounted' by 'prev_period'
> > > > > and sets 'prev_period' to 'period' when an interrupt was delivered
> > > > > to the guest o/s.
> > > > > 
> > > > > +            irq_delivered = update_irq(t, 1);
> > > > > +            if (irq_delivered) {
> > > > > +                t->ticks_not_accounted -= t->prev_period;
> > > > > +                t->prev_period = t->period;
> > > > > +            } else {
> > > > > 
> > > > > Most of the time 'prev_period' is equal to 'period'. It should only
> > > > > be different in the scenario shown above.
> > > > 
> > > > OK, makes sense. You should probably reset ticks_not_accounted to zero
> > > > on HPET initialization (for example, to avoid miscalibration when
> > > > kexec'ing a new kernel).
> > > 
> > > Everybody resetting the machine in anyway is expected to force devices
> > > to be reinitialized, right ?
> > > I may be wrong, but I was under the impression that kexec would do this
> > > as well. In this case, the reset function should be enough.
> > > 
> > kexec does not reset a machine. That's the whole point of kexec in
> > fact.
> Sure thing, but doesn't it force the initialization routine of the devices themselves, without
> going through the bios ?
> 
It just starts new kernel. New kernel's init runs as usual and
re-initialize everything. No it doesn't go through the BIOS. What for?
Actually I happily use kexec on IBM blade server where BIOS run takes no
less then 5 minutes and kexec reboots instantly.

--
			Gleb.
Ulrich Obergfell - May 5, 2011, 8:07 a.m.
Hi Marcelo,
 
> Other than that, shouldnt reset accounting variables to init state on
> write to GLOBAL_ENABLE_CFG / writes to main counter?

I'd suggest to initialize/reset the driftfix-related fields in the
'HPETTimer' structure (including the backlog of unaccounted ticks)
in the following situations.


- When the guest o/s sets the 'CFG_ENABLE' bit (overall enable) in
  the General Configuration Register.

  This is the place in hpet_ram_writel():

        case HPET_CFG:
            ...
            if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
                ...
                for (i = 0; i < s->num_timers; i++) {
                    if ((&s->timer[i])->cmp != ~0ULL) {
                        // initialize / reset fields here
                        hpet_set_timer(&s->timer[i]);


- When the guest o/s sets the 'TN_ENABLE' bit (timer N interrupt
  enable) in the Timer N Configuration and Capabilities Register.

  This is the place in hpet_ram_writel():

        case HPET_TN_CFG:
            ...
            if (activating_bit(old_val, new_val, HPET_TN_ENABLE)) {
                // initialize / reset fields here
                hpet_set_timer(timer);


This should cover cases such as ...

- first time initialization of HPET & timers during guest o/s boot.
  (includes kexec and reboot)

- if a guest o/s stops and restarts the timer.
  (for example, to change the comparator register value or
   to switch a timer between periodic mode and non-periodic mode)


Regards,

Uli
Marcelo Tosatti - May 6, 2011, 2:39 p.m.
On Thu, May 05, 2011 at 04:07:19AM -0400, Ulrich Obergfell wrote:
> 
> Hi Marcelo,
>  
> > Other than that, shouldnt reset accounting variables to init state on
> > write to GLOBAL_ENABLE_CFG / writes to main counter?
> 
> I'd suggest to initialize/reset the driftfix-related fields in the
> 'HPETTimer' structure (including the backlog of unaccounted ticks)
> in the following situations.
> 
> 
> - When the guest o/s sets the 'CFG_ENABLE' bit (overall enable) in
>   the General Configuration Register.
> 
>   This is the place in hpet_ram_writel():
> 
>         case HPET_CFG:
>             ...
>             if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
>                 ...
>                 for (i = 0; i < s->num_timers; i++) {
>                     if ((&s->timer[i])->cmp != ~0ULL) {
>                         // initialize / reset fields here
>                         hpet_set_timer(&s->timer[i]);
> 
> 
> - When the guest o/s sets the 'TN_ENABLE' bit (timer N interrupt
>   enable) in the Timer N Configuration and Capabilities Register.
> 
>   This is the place in hpet_ram_writel():
> 
>         case HPET_TN_CFG:
>             ...
>             if (activating_bit(old_val, new_val, HPET_TN_ENABLE)) {
>                 // initialize / reset fields here
>                 hpet_set_timer(timer);
> 
> 
> This should cover cases such as ...
> 
> - first time initialization of HPET & timers during guest o/s boot.
>   (includes kexec and reboot)
> 
> - if a guest o/s stops and restarts the timer.
>   (for example, to change the comparator register value or
>    to switch a timer between periodic mode and non-periodic mode)
> 
> 
> Regards,
> 
> Uli

Uli, looks good.

Patch

diff --git a/hw/hpet.c b/hw/hpet.c
index 35466ae..92d5f58 100644
--- a/hw/hpet.c
+++ b/hw/hpet.c
@@ -32,6 +32,7 @@ 
 #include "sysbus.h"
 #include "mc146818rtc.h"
 #include "sysemu.h"
+#include <assert.h>
 
 //#define HPET_DEBUG
 #ifdef HPET_DEBUG
@@ -42,6 +43,9 @@ 
 
 #define HPET_MSI_SUPPORT        0
 
+#define MAX_TICKS_NOT_ACCOUNTED     (uint64_t)500000000 /* 5 sec */
+#define MAX_IRQ_RATE                (uint32_t)10
+
 struct HPETState;
 typedef struct HPETTimer {  /* timers */
     uint8_t tn;             /*timer number*/
@@ -326,28 +330,63 @@  static const VMStateDescription vmstate_hpet = {
     }
 };
 
+static bool hpet_timer_has_tick_backlog(HPETTimer *t)
+{
+    uint64_t backlog = t->ticks_not_accounted - (t->period + t->prev_period);
+    return (backlog >= t->period);
+}
+
 /*
  * timer expiration callback
  */
 static void hpet_timer(void *opaque)
 {
     HPETTimer *t = opaque;
+    HPETState *s = t->state;
     uint64_t diff;
-
+    int irq_delivered = 0;
+    uint32_t irq_count = 0;
     uint64_t period = t->period;
     uint64_t cur_tick = hpet_get_ticks(t->state);
 
+    if (s->driftfix && !t->ticks_not_accounted) {
+        t->ticks_not_accounted = t->prev_period = t->period;
+    }
     if (timer_is_periodic(t) && period != 0) {
         if (t->config & HPET_TN_32BIT) {
             while (hpet_time_after(cur_tick, t->cmp)) {
                 t->cmp = (uint32_t)(t->cmp + t->period);
+                t->ticks_not_accounted += t->period;
+                irq_count++;
             }
         } else {
             while (hpet_time_after64(cur_tick, t->cmp)) {
                 t->cmp += period;
+                t->ticks_not_accounted += period;
+                irq_count++;
             }
         }
         diff = hpet_calculate_diff(t, cur_tick);
+        if (s->driftfix) {
+            if (t->ticks_not_accounted > MAX_TICKS_NOT_ACCOUNTED) {
+                t->ticks_not_accounted = t->period + t->prev_period;
+            }
+            if (hpet_timer_has_tick_backlog(t)) {
+                if (t->irq_rate == 1 || irq_count > 1) {
+                    t->irq_rate++;
+                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
+                }
+                if (t->divisor == 0) {
+                    assert(irq_count);
+                }
+                if (irq_count) {
+                    t->divisor = t->irq_rate;
+                }
+                diff /= t->divisor--;
+            } else {
+                t->irq_rate = 1;
+            }
+        }
         qemu_mod_timer(t->qemu_timer,
                        qemu_get_clock_ns(vm_clock) + (int64_t)ticks_to_ns(diff));
     } else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
@@ -358,7 +397,22 @@  static void hpet_timer(void *opaque)
             t->wrap_flag = 0;
         }
     }
-    update_irq(t, 1);
+    if (s->driftfix && timer_is_periodic(t) && period != 0) {
+        if (t->ticks_not_accounted >= t->period + t->prev_period) {
+            irq_delivered = update_irq(t, 1);
+            if (irq_delivered) {
+                t->ticks_not_accounted -= t->prev_period;
+                t->prev_period = t->period;
+            } else {
+                if (irq_count) {
+                    t->irq_rate++;
+                    t->irq_rate = MIN(t->irq_rate, MAX_IRQ_RATE);
+                }
+            }
+        }
+    } else {
+        update_irq(t, 1);
+    }
 }
 
 static void hpet_set_timer(HPETTimer *t)
@@ -697,6 +751,11 @@  static void hpet_reset(DeviceState *d)
         timer->config |=  0x00000004ULL << 32;
         timer->period = 0ULL;
         timer->wrap_flag = 0;
+
+        timer->prev_period = 0;
+        timer->ticks_not_accounted = 0;
+        timer->irq_rate = 1;
+        timer->divisor = 1;
     }
 
     s->hpet_counter = 0ULL;