diff mbox series

target/ppc: only save guest timebase once after stopping

Message ID 20180504042044.10318-1-mdroth@linux.vnet.ibm.com
State New
Headers show
Series target/ppc: only save guest timebase once after stopping | expand

Commit Message

Michael Roth May 4, 2018, 4:20 a.m. UTC
In some cases (e.g. spapr) we record guest timebase after qmp_stop()
via a runstate hook so we can restore it on qmp_cont(). If a migration
occurs in between those events we end up saving it again, this time
based on the current timebase the guest would be seeing had it been
running. This has the effect of advancing the guest timebase while
it is stopped, which is not what the code intends.

Other than simple jumps in time, this has been seen to trigger what
appear to be RCU-related crashes in recent kernels when the advance
exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
common operations such as `virsh migrate ... --timeout 60`.

Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Laurent Vivier <lvivier@redhat.com>
Cc: qemu-ppc@nongnu.org
Cc: qemu-stable@nongnu.org
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 hw/ppc/ppc.c         | 12 ++++++++++++
 target/ppc/cpu-qom.h |  1 +
 2 files changed, 13 insertions(+)

Comments

Greg Kurz May 4, 2018, 9:37 a.m. UTC | #1
On Thu,  3 May 2018 23:20:44 -0500
Michael Roth <mdroth@linux.vnet.ibm.com> wrote:

> In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> via a runstate hook so we can restore it on qmp_cont(). If a migration
> occurs in between those events we end up saving it again, this time
> based on the current timebase the guest would be seeing had it been
> running. This has the effect of advancing the guest timebase while
> it is stopped, which is not what the code intends.
> 

Hi Mike,

The current behavior was introduced by:

commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
Author: Laurent Vivier <lvivier@redhat.com>
Date:   Fri Jan 27 13:24:58 2017 +0100

    spapr: clock should count only if vm is running

and we have this in the changelog:

    We keep timebase_pre_save to reduce the clock difference on
    migration like in:
        6053a86 kvmclock: reduce kvmclock difference on migration


So your patch totally negates ^^ ? Also, I can't see a case where
timebase_save() could be called from vmstate_save_state() while the
VM is running, ie, you could drop timebase_pre_save()... or am I
*probably* missing something ?

> Other than simple jumps in time, this has been seen to trigger what
> appear to be RCU-related crashes in recent kernels when the advance
> exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> common operations such as `virsh migrate ... --timeout 60`.
> 
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Cc: Laurent Vivier <lvivier@redhat.com>
> Cc: qemu-ppc@nongnu.org
> Cc: qemu-stable@nongnu.org
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
>  hw/ppc/ppc.c         | 12 ++++++++++++
>  target/ppc/cpu-qom.h |  1 +
>  2 files changed, 13 insertions(+)
> 
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index ec4be25f49..ff0a107864 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -865,6 +865,15 @@ static void timebase_save(PPCTimebase *tb)
>      uint64_t ticks = cpu_get_host_ticks();
>      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>  
> +    /* since we generally save timebase just after the guest
> +     * has stopped, avoid trying to save it again since we will
> +     * end up advancing it by the amount of ticks that have
> +     * elapsed in the host since the initial save
> +     */
> +    if (tb->saved) {
> +        return;
> +    }
> +
>      if (!first_ppc_cpu->env.tb_env) {
>          error_report("No timebase object");
>          return;
> @@ -877,6 +886,7 @@ static void timebase_save(PPCTimebase *tb)
>       * there is no need to update it from KVM here
>       */
>      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> +    tb->saved = true;
>  }
>  
>  static void timebase_load(PPCTimebase *tb)
> @@ -908,6 +918,8 @@ static void timebase_load(PPCTimebase *tb)
>                          &pcpu->env.tb_env->tb_offset);
>  #endif
>      }
> +
> +    tb->saved = false;
>  }
>  
>  void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> index deaa46a14b..ec2dbcdcae 100644
> --- a/target/ppc/cpu-qom.h
> +++ b/target/ppc/cpu-qom.h
> @@ -210,6 +210,7 @@ typedef struct PowerPCCPUClass {
>  typedef struct PPCTimebase {
>      uint64_t guest_timebase;
>      int64_t time_of_the_day_ns;
> +    bool saved;
>  } PPCTimebase;
>  
>  extern const struct VMStateDescription vmstate_ppc_timebase;
Michael Roth May 4, 2018, 12:18 p.m. UTC | #2
Quoting Greg Kurz (2018-05-04 04:37:24)
> On Thu,  3 May 2018 23:20:44 -0500
> Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> 
> > In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> > via a runstate hook so we can restore it on qmp_cont(). If a migration
> > occurs in between those events we end up saving it again, this time
> > based on the current timebase the guest would be seeing had it been
> > running. This has the effect of advancing the guest timebase while
> > it is stopped, which is not what the code intends.
> > 
> 
> Hi Mike,
> 
> The current behavior was introduced by:
> 
> commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
> Author: Laurent Vivier <lvivier@redhat.com>
> Date:   Fri Jan 27 13:24:58 2017 +0100
> 
>     spapr: clock should count only if vm is running
> 
> and we have this in the changelog:
> 
>     We keep timebase_pre_save to reduce the clock difference on
>     migration like in:
>         6053a86 kvmclock: reduce kvmclock difference on migration
> 
> 
> So your patch totally negates ^^ ? Also, I can't see a case where

Yah... this is a bit confusing. On one hand, the patch/summary is clearly
trying to avoid the guest time from advancing while it is stopped, which
is in the spirit of this patch. But at the same time it is trying to
compensate for loss of time (relative to host) due to downtime window.

I think the subtlety is in the amount of time... saving at pre_save
rather than vm_stop() compensates for the normal downtime window, which
is *usually* small (5s is the figure they quote in the notes there and
in the motivating 6053a86 "kvmclock: reduce kvmclock difference on
migration"). The delays between vm_stop and vm_cont via something like
virsh suspend/resume is unbounded, unhowever, hence the rationale for
the runstate hook (?).

So maybe small jumps are considered okay, and large ones not? If that's
the reasoning, then this patch is addressing the later, so it's not
necessarily in conflict with that motivation, but the implementation
does negate the small jumps we try to avoid via pre_save hook since
we'll end up keep the version we saved just after vm_stop instead.

I would note that the downtime window itself, while usually small, can
also be quite large. With 1GB hugepages we've seen some guests requiring
downtime windows to be set to 25s until QEMU would start cut-over. Also
rcu_cpu_stall_timeout is configurable...it's possible if we set it to
5s it could trigger on the jump the guest experiences from pre_save (I
haven't tested that though).

Maybe trying to compensate for downtime is a generally bad idea and we
should just leave it up to NTP/etc? Or maybe we should choose a specific
upper bound on how much migration downtime we're willing to compensate
for and enforce that directly? E.g. tb->saved becomes tb->saved_time and
we check the difference in pre_save before calling timebase_save()
again.

> So your patch totally negates ^^ ? Also, I can't see a case where
> timebase_save() could be called from vmstate_save_state() while the
> VM is running, ie, you could drop timebase_pre_save()... or am I
> *probably* missing something ?

Yah, I didn't notice that my patch completely negated the pre_save
hook... for some reason I was thinking that would continue to function
normally if we didn't call qmp_stop() explicitly but that's clearly not
the case. So yah, dropping timebase_pre_save() is essentially what my
patch is doing...

> 
> > Other than simple jumps in time, this has been seen to trigger what
> > appear to be RCU-related crashes in recent kernels when the advance
> > exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> > common operations such as `virsh migrate ... --timeout 60`.
> > 
> > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > Cc: David Gibson <david@gibson.dropbear.id.au>
> > Cc: Laurent Vivier <lvivier@redhat.com>
> > Cc: qemu-ppc@nongnu.org
> > Cc: qemu-stable@nongnu.org
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/ppc.c         | 12 ++++++++++++
> >  target/ppc/cpu-qom.h |  1 +
> >  2 files changed, 13 insertions(+)
> > 
> > diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> > index ec4be25f49..ff0a107864 100644
> > --- a/hw/ppc/ppc.c
> > +++ b/hw/ppc/ppc.c
> > @@ -865,6 +865,15 @@ static void timebase_save(PPCTimebase *tb)
> >      uint64_t ticks = cpu_get_host_ticks();
> >      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> >  
> > +    /* since we generally save timebase just after the guest
> > +     * has stopped, avoid trying to save it again since we will
> > +     * end up advancing it by the amount of ticks that have
> > +     * elapsed in the host since the initial save
> > +     */
> > +    if (tb->saved) {
> > +        return;
> > +    }
> > +
> >      if (!first_ppc_cpu->env.tb_env) {
> >          error_report("No timebase object");
> >          return;
> > @@ -877,6 +886,7 @@ static void timebase_save(PPCTimebase *tb)
> >       * there is no need to update it from KVM here
> >       */
> >      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> > +    tb->saved = true;
> >  }
> >  
> >  static void timebase_load(PPCTimebase *tb)
> > @@ -908,6 +918,8 @@ static void timebase_load(PPCTimebase *tb)
> >                          &pcpu->env.tb_env->tb_offset);
> >  #endif
> >      }
> > +
> > +    tb->saved = false;
> >  }
> >  
> >  void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> > diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> > index deaa46a14b..ec2dbcdcae 100644
> > --- a/target/ppc/cpu-qom.h
> > +++ b/target/ppc/cpu-qom.h
> > @@ -210,6 +210,7 @@ typedef struct PowerPCCPUClass {
> >  typedef struct PPCTimebase {
> >      uint64_t guest_timebase;
> >      int64_t time_of_the_day_ns;
> > +    bool saved;
> >  } PPCTimebase;
> >  
> >  extern const struct VMStateDescription vmstate_ppc_timebase;
>
Greg Kurz May 4, 2018, 1:50 p.m. UTC | #3
On Fri, 04 May 2018 07:18:13 -0500
Michael Roth <mdroth@linux.vnet.ibm.com> wrote:

> Quoting Greg Kurz (2018-05-04 04:37:24)
> > On Thu,  3 May 2018 23:20:44 -0500
> > Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> >   
> > > In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> > > via a runstate hook so we can restore it on qmp_cont(). If a migration
> > > occurs in between those events we end up saving it again, this time
> > > based on the current timebase the guest would be seeing had it been
> > > running. This has the effect of advancing the guest timebase while
> > > it is stopped, which is not what the code intends.
> > >   
> > 
> > Hi Mike,
> > 
> > The current behavior was introduced by:
> > 
> > commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
> > Author: Laurent Vivier <lvivier@redhat.com>
> > Date:   Fri Jan 27 13:24:58 2017 +0100
> > 
> >     spapr: clock should count only if vm is running
> > 
> > and we have this in the changelog:
> > 
> >     We keep timebase_pre_save to reduce the clock difference on
> >     migration like in:
> >         6053a86 kvmclock: reduce kvmclock difference on migration
> > 
> > 
> > So your patch totally negates ^^ ? Also, I can't see a case where  
> 
> Yah... this is a bit confusing. On one hand, the patch/summary is clearly
> trying to avoid the guest time from advancing while it is stopped, which
> is in the spirit of this patch. But at the same time it is trying to
> compensate for loss of time (relative to host) due to downtime window.
> 

Yeah... not sure why Laurent decided to address both in the same patch...
maybe just because we already had the pre_save hook ?

> I think the subtlety is in the amount of time... saving at pre_save
> rather than vm_stop() compensates for the normal downtime window, which
> is *usually* small (5s is the figure they quote in the notes there and
> in the motivating 6053a86 "kvmclock: reduce kvmclock difference on
> migration"). The delays between vm_stop and vm_cont via something like
> virsh suspend/resume is unbounded, unhowever, hence the rationale for
> the runstate hook (?).
> 

That's my understanding as well.

> So maybe small jumps are considered okay, and large ones not? If that's
> the reasoning, then this patch is addressing the later, so it's not
> necessarily in conflict with that motivation, but the implementation
> does negate the small jumps we try to avoid via pre_save hook since
> we'll end up keep the version we saved just after vm_stop instead.
> 
> I would note that the downtime window itself, while usually small, can
> also be quite large. With 1GB hugepages we've seen some guests requiring
> downtime windows to be set to 25s until QEMU would start cut-over. Also
> rcu_cpu_stall_timeout is configurable...it's possible if we set it to
> 5s it could trigger on the jump the guest experiences from pre_save (I
> haven't tested that though).
> 
> Maybe trying to compensate for downtime is a generally bad idea and we
> should just leave it up to NTP/etc? 

My understanding of NTP is that it isn't designed to cope with sudden
time differences, which is exactly what happens in our case.

> Or maybe we should choose a specific
> upper bound on how much migration downtime we're willing to compensate
> for and enforce that directly? E.g. tb->saved becomes tb->saved_time and
> we check the difference in pre_save before calling timebase_save()
> again.
> 

This would maybe allow to reach a compromise between the current code
and your patch... but it would still be difficult to come up with
a sensible value for this upper bound, wouldn't it ?

> > So your patch totally negates ^^ ? Also, I can't see a case where
> > timebase_save() could be called from vmstate_save_state() while the
> > VM is running, ie, you could drop timebase_pre_save()... or am I
> > *probably* missing something ?  
> 
> Yah, I didn't notice that my patch completely negated the pre_save
> hook... for some reason I was thinking that would continue to function
> normally if we didn't call qmp_stop() explicitly but that's clearly not
> the case. So yah, dropping timebase_pre_save() is essentially what my
> patch is doing...
> 

<thinking aloud>
How does Linux cope with standard software suspend or hibernate ? It also
causes a downtime and it doesn't generate RCU stalls AFAIK... would it
be possible/make sense for migration to look like an hibernate ?
</thinking aloud>

> >   
> > > Other than simple jumps in time, this has been seen to trigger what
> > > appear to be RCU-related crashes in recent kernels when the advance
> > > exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> > > common operations such as `virsh migrate ... --timeout 60`.
> > > 
> > > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > Cc: David Gibson <david@gibson.dropbear.id.au>
> > > Cc: Laurent Vivier <lvivier@redhat.com>
> > > Cc: qemu-ppc@nongnu.org
> > > Cc: qemu-stable@nongnu.org
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > ---
> > >  hw/ppc/ppc.c         | 12 ++++++++++++
> > >  target/ppc/cpu-qom.h |  1 +
> > >  2 files changed, 13 insertions(+)
> > > 
> > > diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> > > index ec4be25f49..ff0a107864 100644
> > > --- a/hw/ppc/ppc.c
> > > +++ b/hw/ppc/ppc.c
> > > @@ -865,6 +865,15 @@ static void timebase_save(PPCTimebase *tb)
> > >      uint64_t ticks = cpu_get_host_ticks();
> > >      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> > >  
> > > +    /* since we generally save timebase just after the guest
> > > +     * has stopped, avoid trying to save it again since we will
> > > +     * end up advancing it by the amount of ticks that have
> > > +     * elapsed in the host since the initial save
> > > +     */
> > > +    if (tb->saved) {
> > > +        return;
> > > +    }
> > > +
> > >      if (!first_ppc_cpu->env.tb_env) {
> > >          error_report("No timebase object");
> > >          return;
> > > @@ -877,6 +886,7 @@ static void timebase_save(PPCTimebase *tb)
> > >       * there is no need to update it from KVM here
> > >       */
> > >      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> > > +    tb->saved = true;
> > >  }
> > >  
> > >  static void timebase_load(PPCTimebase *tb)
> > > @@ -908,6 +918,8 @@ static void timebase_load(PPCTimebase *tb)
> > >                          &pcpu->env.tb_env->tb_offset);
> > >  #endif
> > >      }
> > > +
> > > +    tb->saved = false;
> > >  }
> > >  
> > >  void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> > > diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> > > index deaa46a14b..ec2dbcdcae 100644
> > > --- a/target/ppc/cpu-qom.h
> > > +++ b/target/ppc/cpu-qom.h
> > > @@ -210,6 +210,7 @@ typedef struct PowerPCCPUClass {
> > >  typedef struct PPCTimebase {
> > >      uint64_t guest_timebase;
> > >      int64_t time_of_the_day_ns;
> > > +    bool saved;
> > >  } PPCTimebase;
> > >  
> > >  extern const struct VMStateDescription vmstate_ppc_timebase;  
> >   
>
Laurent Vivier May 4, 2018, 3:59 p.m. UTC | #4
On 04/05/2018 15:50, Greg Kurz wrote:
> On Fri, 04 May 2018 07:18:13 -0500
> Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> 
>> Quoting Greg Kurz (2018-05-04 04:37:24)
>>> On Thu,  3 May 2018 23:20:44 -0500
>>> Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
>>>   
>>>> In some cases (e.g. spapr) we record guest timebase after qmp_stop()
>>>> via a runstate hook so we can restore it on qmp_cont(). If a migration
>>>> occurs in between those events we end up saving it again, this time
>>>> based on the current timebase the guest would be seeing had it been
>>>> running. This has the effect of advancing the guest timebase while
>>>> it is stopped, which is not what the code intends.
>>>>   
>>>
>>> Hi Mike,
>>>
>>> The current behavior was introduced by:
>>>
>>> commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
>>> Author: Laurent Vivier <lvivier@redhat.com>
>>> Date:   Fri Jan 27 13:24:58 2017 +0100
>>>
>>>     spapr: clock should count only if vm is running
>>>
>>> and we have this in the changelog:
>>>
>>>     We keep timebase_pre_save to reduce the clock difference on
>>>     migration like in:
>>>         6053a86 kvmclock: reduce kvmclock difference on migration
>>>
>>>
>>> So your patch totally negates ^^ ? Also, I can't see a case where  
>>
>> Yah... this is a bit confusing. On one hand, the patch/summary is clearly
>> trying to avoid the guest time from advancing while it is stopped, which
>> is in the spirit of this patch. But at the same time it is trying to
>> compensate for loss of time (relative to host) due to downtime window.
>>
> 
> Yeah... not sure why Laurent decided to address both in the same patch...
> maybe just because we already had the pre_save hook ?

Well, it seemed to be a good idea to do like it is done for x86 [1]

But I think you should remove the timebase_pre_save() function (and the
comment) instead of adding a new flag.

Thanks,
Laurent

[1] the idea was proposed by Paolo:
https://lists.gnu.org/archive/html/qemu-devel/2017-01/msg05862.html
David Gibson May 5, 2018, 4:20 a.m. UTC | #5
On Fri, May 04, 2018 at 07:18:13AM -0500, Michael Roth wrote:
> Quoting Greg Kurz (2018-05-04 04:37:24)
> > On Thu,  3 May 2018 23:20:44 -0500
> > Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> > 
> > > In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> > > via a runstate hook so we can restore it on qmp_cont(). If a migration
> > > occurs in between those events we end up saving it again, this time
> > > based on the current timebase the guest would be seeing had it been
> > > running. This has the effect of advancing the guest timebase while
> > > it is stopped, which is not what the code intends.
> > > 
> > 
> > Hi Mike,
> > 
> > The current behavior was introduced by:
> > 
> > commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
> > Author: Laurent Vivier <lvivier@redhat.com>
> > Date:   Fri Jan 27 13:24:58 2017 +0100
> > 
> >     spapr: clock should count only if vm is running
> > 
> > and we have this in the changelog:
> > 
> >     We keep timebase_pre_save to reduce the clock difference on
> >     migration like in:
> >         6053a86 kvmclock: reduce kvmclock difference on migration
> > 
> > 
> > So your patch totally negates ^^ ? Also, I can't see a case where
> 
> Yah... this is a bit confusing. On one hand, the patch/summary is clearly
> trying to avoid the guest time from advancing while it is stopped, which
> is in the spirit of this patch. But at the same time it is trying to
> compensate for loss of time (relative to host) due to downtime window.
> 
> I think the subtlety is in the amount of time... saving at pre_save
> rather than vm_stop() compensates for the normal downtime window, which
> is *usually* small (5s is the figure they quote in the notes there and
> in the motivating 6053a86 "kvmclock: reduce kvmclock difference on
> migration"). The delays between vm_stop and vm_cont via something like
> virsh suspend/resume is unbounded, unhowever, hence the rationale for
> the runstate hook (?).

> So maybe small jumps are considered okay, and large ones not? If that's
> the reasoning, then this patch is addressing the later, so it's not
> necessarily in conflict with that motivation, but the implementation
> does negate the small jumps we try to avoid via pre_save hook since
> we'll end up keep the version we saved just after vm_stop instead.

So, the size of the jump is an important difference in practice, but I
don't think it's what we should base our decisions on.  Rather, we
should be basing the decision on the *reason* for the jump.

If it's due to an explicit user requested stoppage, then we should
stop the clock.  The user has basically asked for the VM to be put
into statis, and that won't really work if we don't stop the clock.

If the stoppage is due to an internal detail for what, from the user's
point of view, is supposed to be a still-running VM (i.e. a
migration), then the clock should continue.  Otherwise it will be
wrong, afterwards.

> I would note that the downtime window itself, while usually small, can
> also be quite large. With 1GB hugepages we've seen some guests requiring
> downtime windows to be set to 25s until QEMU would start cut-over. Also
> rcu_cpu_stall_timeout is configurable...it's possible if we set it to
> 5s it could trigger on the jump the guest experiences from pre_save (I
> haven't tested that though).

> Maybe trying to compensate for downtime is a generally bad idea and we
> should just leave it up to NTP/etc? Or maybe we should choose a specific
> upper bound on how much migration downtime we're willing to compensate
> for and enforce that directly? E.g. tb->saved becomes tb->saved_time and
> we check the difference in pre_save before calling timebase_save()
> again.

Yes, downtimes can sometimes be long.  I still think it's correct to
keep the clock going in that case.  The guest may give warnings
because it's seeing something funny with the clock.  Something *is*
funny with the clock, and those warnings are correct.  Basically, when
the downtime is that long we can't really maintain the illusion of a
continuously running VM.  Pretending we can by fudging the clocks is
not doing our users a service

> > So your patch totally negates ^^ ? Also, I can't see a case where
> > timebase_save() could be called from vmstate_save_state() while the
> > VM is running, ie, you could drop timebase_pre_save()... or am I
> > *probably* missing something ?
> 
> Yah, I didn't notice that my patch completely negated the pre_save
> hook... for some reason I was thinking that would continue to function
> normally if we didn't call qmp_stop() explicitly but that's clearly not
> the case. So yah, dropping timebase_pre_save() is essentially what my
> patch is doing...
> 
> > 
> > > Other than simple jumps in time, this has been seen to trigger what
> > > appear to be RCU-related crashes in recent kernels when the advance
> > > exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> > > common operations such as `virsh migrate ... --timeout 60`.
> > > 
> > > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > Cc: David Gibson <david@gibson.dropbear.id.au>
> > > Cc: Laurent Vivier <lvivier@redhat.com>
> > > Cc: qemu-ppc@nongnu.org
> > > Cc: qemu-stable@nongnu.org
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > > ---
> > >  hw/ppc/ppc.c         | 12 ++++++++++++
> > >  target/ppc/cpu-qom.h |  1 +
> > >  2 files changed, 13 insertions(+)
> > > 
> > > diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> > > index ec4be25f49..ff0a107864 100644
> > > --- a/hw/ppc/ppc.c
> > > +++ b/hw/ppc/ppc.c
> > > @@ -865,6 +865,15 @@ static void timebase_save(PPCTimebase *tb)
> > >      uint64_t ticks = cpu_get_host_ticks();
> > >      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> > >  
> > > +    /* since we generally save timebase just after the guest
> > > +     * has stopped, avoid trying to save it again since we will
> > > +     * end up advancing it by the amount of ticks that have
> > > +     * elapsed in the host since the initial save
> > > +     */
> > > +    if (tb->saved) {
> > > +        return;
> > > +    }
> > > +
> > >      if (!first_ppc_cpu->env.tb_env) {
> > >          error_report("No timebase object");
> > >          return;
> > > @@ -877,6 +886,7 @@ static void timebase_save(PPCTimebase *tb)
> > >       * there is no need to update it from KVM here
> > >       */
> > >      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> > > +    tb->saved = true;
> > >  }
> > >  
> > >  static void timebase_load(PPCTimebase *tb)
> > > @@ -908,6 +918,8 @@ static void timebase_load(PPCTimebase *tb)
> > >                          &pcpu->env.tb_env->tb_offset);
> > >  #endif
> > >      }
> > > +
> > > +    tb->saved = false;
> > >  }
> > >  
> > >  void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> > > diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> > > index deaa46a14b..ec2dbcdcae 100644
> > > --- a/target/ppc/cpu-qom.h
> > > +++ b/target/ppc/cpu-qom.h
> > > @@ -210,6 +210,7 @@ typedef struct PowerPCCPUClass {
> > >  typedef struct PPCTimebase {
> > >      uint64_t guest_timebase;
> > >      int64_t time_of_the_day_ns;
> > > +    bool saved;
> > >  } PPCTimebase;
> > >  
> > >  extern const struct VMStateDescription vmstate_ppc_timebase;
> > 
>
David Gibson May 5, 2018, 4:23 a.m. UTC | #6
On Fri, May 04, 2018 at 03:50:28PM +0200, Greg Kurz wrote:
> On Fri, 04 May 2018 07:18:13 -0500
> Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> 
> > Quoting Greg Kurz (2018-05-04 04:37:24)
> > > On Thu,  3 May 2018 23:20:44 -0500
> > > Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> > >   
> > > > In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> > > > via a runstate hook so we can restore it on qmp_cont(). If a migration
> > > > occurs in between those events we end up saving it again, this time
> > > > based on the current timebase the guest would be seeing had it been
> > > > running. This has the effect of advancing the guest timebase while
> > > > it is stopped, which is not what the code intends.
> > > >   
> > > 
> > > Hi Mike,
> > > 
> > > The current behavior was introduced by:
> > > 
> > > commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
> > > Author: Laurent Vivier <lvivier@redhat.com>
> > > Date:   Fri Jan 27 13:24:58 2017 +0100
> > > 
> > >     spapr: clock should count only if vm is running
> > > 
> > > and we have this in the changelog:
> > > 
> > >     We keep timebase_pre_save to reduce the clock difference on
> > >     migration like in:
> > >         6053a86 kvmclock: reduce kvmclock difference on migration
> > > 
> > > 
> > > So your patch totally negates ^^ ? Also, I can't see a case where  
> > 
> > Yah... this is a bit confusing. On one hand, the patch/summary is clearly
> > trying to avoid the guest time from advancing while it is stopped, which
> > is in the spirit of this patch. But at the same time it is trying to
> > compensate for loss of time (relative to host) due to downtime window.
> > 
> 
> Yeah... not sure why Laurent decided to address both in the same patch...
> maybe just because we already had the pre_save hook ?
> 
> > I think the subtlety is in the amount of time... saving at pre_save
> > rather than vm_stop() compensates for the normal downtime window, which
> > is *usually* small (5s is the figure they quote in the notes there and
> > in the motivating 6053a86 "kvmclock: reduce kvmclock difference on
> > migration"). The delays between vm_stop and vm_cont via something like
> > virsh suspend/resume is unbounded, unhowever, hence the rationale for
> > the runstate hook (?).
> > 
> 
> That's my understanding as well.
> 
> > So maybe small jumps are considered okay, and large ones not? If that's
> > the reasoning, then this patch is addressing the later, so it's not
> > necessarily in conflict with that motivation, but the implementation
> > does negate the small jumps we try to avoid via pre_save hook since
> > we'll end up keep the version we saved just after vm_stop instead.
> > 
> > I would note that the downtime window itself, while usually small, can
> > also be quite large. With 1GB hugepages we've seen some guests requiring
> > downtime windows to be set to 25s until QEMU would start cut-over. Also
> > rcu_cpu_stall_timeout is configurable...it's possible if we set it to
> > 5s it could trigger on the jump the guest experiences from pre_save (I
> > haven't tested that though).
> > 
> > Maybe trying to compensate for downtime is a generally bad idea and we
> > should just leave it up to NTP/etc? 
> 
> My understanding of NTP is that it isn't designed to cope with sudden
> time differences, which is exactly what happens in our case.

I think so too.  I mean it will correct things right eventually,
unless the jump is *really* huge, but it's not a good enough solution
that we shouldn't at least try to keep the guest's wall-time clock
correct across the migration.

> > Or maybe we should choose a specific
> > upper bound on how much migration downtime we're willing to compensate
> > for and enforce that directly? E.g. tb->saved becomes tb->saved_time and
> > we check the difference in pre_save before calling timebase_save()
> > again.
> > 
> 
> This would maybe allow to reach a compromise between the current code
> and your patch... but it would still be difficult to come up with
> a sensible value for this upper bound, wouldn't it ?
> 
> > > So your patch totally negates ^^ ? Also, I can't see a case where
> > > timebase_save() could be called from vmstate_save_state() while the
> > > VM is running, ie, you could drop timebase_pre_save()... or am I
> > > *probably* missing something ?  
> > 
> > Yah, I didn't notice that my patch completely negated the pre_save
> > hook... for some reason I was thinking that would continue to function
> > normally if we didn't call qmp_stop() explicitly but that's clearly not
> > the case. So yah, dropping timebase_pre_save() is essentially what my
> > patch is doing...
> > 
> 
> <thinking aloud>
> How does Linux cope with standard software suspend or hibernate ? It also
> causes a downtime and it doesn't generate RCU stalls AFAIK... would it
> be possible/make sense for migration to look like an hibernate ?
> </thinking aloud>

In that case the guest is aware of the stoppage, and knows to a)
correct its internal clock afterwards (usually by looking at the RTC),
and b) not see the sudden clock jump as an error.

By design, guests are not aware of a migration, so they can't do that.
David Gibson July 26, 2018, 5:07 a.m. UTC | #7
On Thu, May 03, 2018 at 11:20:44PM -0500, Michael Roth wrote:
> In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> via a runstate hook so we can restore it on qmp_cont(). If a migration
> occurs in between those events we end up saving it again, this time
> based on the current timebase the guest would be seeing had it been
> running. This has the effect of advancing the guest timebase while
> it is stopped, which is not what the code intends.
> 
> Other than simple jumps in time, this has been seen to trigger what
> appear to be RCU-related crashes in recent kernels when the advance
> exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> common operations such as `virsh migrate ... --timeout 60`.
> 
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Cc: Laurent Vivier <lvivier@redhat.com>
> Cc: qemu-ppc@nongnu.org
> Cc: qemu-stable@nongnu.org
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>

Sorry, this fell off my radar for ages, but I've finally had a chance
to look at it properly.

I'm not totally convinced this handle all the possible edge cases
correctly, but I am convinced it gives behaviour that's more correct
than we have now.  It doesn't introduce changes to the interface or
migration stream that would break things in future, so I've applied it
to ppc-for-3.1.

> ---
>  hw/ppc/ppc.c         | 12 ++++++++++++
>  target/ppc/cpu-qom.h |  1 +
>  2 files changed, 13 insertions(+)
> 
> diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> index ec4be25f49..ff0a107864 100644
> --- a/hw/ppc/ppc.c
> +++ b/hw/ppc/ppc.c
> @@ -865,6 +865,15 @@ static void timebase_save(PPCTimebase *tb)
>      uint64_t ticks = cpu_get_host_ticks();
>      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
>  
> +    /* since we generally save timebase just after the guest
> +     * has stopped, avoid trying to save it again since we will
> +     * end up advancing it by the amount of ticks that have
> +     * elapsed in the host since the initial save
> +     */
> +    if (tb->saved) {
> +        return;
> +    }
> +
>      if (!first_ppc_cpu->env.tb_env) {
>          error_report("No timebase object");
>          return;
> @@ -877,6 +886,7 @@ static void timebase_save(PPCTimebase *tb)
>       * there is no need to update it from KVM here
>       */
>      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> +    tb->saved = true;
>  }
>  
>  static void timebase_load(PPCTimebase *tb)
> @@ -908,6 +918,8 @@ static void timebase_load(PPCTimebase *tb)
>                          &pcpu->env.tb_env->tb_offset);
>  #endif
>      }
> +
> +    tb->saved = false;
>  }
>  
>  void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> index deaa46a14b..ec2dbcdcae 100644
> --- a/target/ppc/cpu-qom.h
> +++ b/target/ppc/cpu-qom.h
> @@ -210,6 +210,7 @@ typedef struct PowerPCCPUClass {
>  typedef struct PPCTimebase {
>      uint64_t guest_timebase;
>      int64_t time_of_the_day_ns;
> +    bool saved;
>  } PPCTimebase;
>  
>  extern const struct VMStateDescription vmstate_ppc_timebase;
Laurent Vivier July 26, 2018, 7:44 a.m. UTC | #8
On 26/07/2018 07:07, David Gibson wrote:
> On Thu, May 03, 2018 at 11:20:44PM -0500, Michael Roth wrote:
>> In some cases (e.g. spapr) we record guest timebase after qmp_stop()
>> via a runstate hook so we can restore it on qmp_cont(). If a migration
>> occurs in between those events we end up saving it again, this time
>> based on the current timebase the guest would be seeing had it been
>> running. This has the effect of advancing the guest timebase while
>> it is stopped, which is not what the code intends.
>>
>> Other than simple jumps in time, this has been seen to trigger what
>> appear to be RCU-related crashes in recent kernels when the advance
>> exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
>> common operations such as `virsh migrate ... --timeout 60`.
>>
>> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Cc: Laurent Vivier <lvivier@redhat.com>
>> Cc: qemu-ppc@nongnu.org
>> Cc: qemu-stable@nongnu.org
>> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> 
> Sorry, this fell off my radar for ages, but I've finally had a chance
> to look at it properly.
> 
> I'm not totally convinced this handle all the possible edge cases
> correctly, but I am convinced it gives behaviour that's more correct
> than we have now.  It doesn't introduce changes to the interface or
> migration stream that would break things in future, so I've applied it
> to ppc-for-3.1.

See https://bugzilla.redhat.com/show_bug.cgi?id=1571230#c8

So you revert:

commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
Author: Laurent Vivier <lvivier@redhat.com>
Date:   Fri Jan 27 13:24:58 2017 +0100

    spapr: clock should count only if vm is running

and change the behaviour compared to x86_64

     6053a86 kvmclock: reduce kvmclock difference on migration

Is this what you want?

Thanks,
Laurent
Michael Roth July 26, 2018, 12:30 p.m. UTC | #9
Quoting David Gibson (2018-07-26 00:07:46)
> On Thu, May 03, 2018 at 11:20:44PM -0500, Michael Roth wrote:
> > In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> > via a runstate hook so we can restore it on qmp_cont(). If a migration
> > occurs in between those events we end up saving it again, this time
> > based on the current timebase the guest would be seeing had it been
> > running. This has the effect of advancing the guest timebase while
> > it is stopped, which is not what the code intends.
> > 
> > Other than simple jumps in time, this has been seen to trigger what
> > appear to be RCU-related crashes in recent kernels when the advance
> > exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> > common operations such as `virsh migrate ... --timeout 60`.
> > 
> > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > Cc: David Gibson <david@gibson.dropbear.id.au>
> > Cc: Laurent Vivier <lvivier@redhat.com>
> > Cc: qemu-ppc@nongnu.org
> > Cc: qemu-stable@nongnu.org
> > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> 
> Sorry, this fell off my radar for ages, but I've finally had a chance
> to look at it properly.

No problem, I had assumed it just wasn't deemed needed based on the
discussion.

> 
> I'm not totally convinced this handle all the possible edge cases
> correctly, but I am convinced it gives behaviour that's more correct
> than we have now.  It doesn't introduce changes to the interface or
> migration stream that would break things in future, so I've applied it
> to ppc-for-3.1.

There's a flaw with the patch that Greg pointed out to me previously
in that it doesn't just avoid the presave hook when vm_stop is issued
manually via monitor, but also when vm_stop is issued by the migration
code itself. The latter wasn't intentional. I'm not sure if we currently
have a good way to distinguish between the 2 cases to rectify that.

As the patch currently stands it is effectively a revert of the
Laurent's original patch.

FWIW, the RCU-related crashes mentioned in the original commit turned
out to be a separate issue that was exacerbated by RCU stall warnings
messages (which *would* be less likely with this patch). So this would
be more about user experience and spurious log messages than an actual
known bug fix. I am still of the opinion that we shouldn't resave the
clock if the vm_stop was manually-issued; it's just that the current
patch oversteps that goal.

> 
> > ---
> >  hw/ppc/ppc.c         | 12 ++++++++++++
> >  target/ppc/cpu-qom.h |  1 +
> >  2 files changed, 13 insertions(+)
> > 
> > diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
> > index ec4be25f49..ff0a107864 100644
> > --- a/hw/ppc/ppc.c
> > +++ b/hw/ppc/ppc.c
> > @@ -865,6 +865,15 @@ static void timebase_save(PPCTimebase *tb)
> >      uint64_t ticks = cpu_get_host_ticks();
> >      PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
> >  
> > +    /* since we generally save timebase just after the guest
> > +     * has stopped, avoid trying to save it again since we will
> > +     * end up advancing it by the amount of ticks that have
> > +     * elapsed in the host since the initial save
> > +     */
> > +    if (tb->saved) {
> > +        return;
> > +    }
> > +
> >      if (!first_ppc_cpu->env.tb_env) {
> >          error_report("No timebase object");
> >          return;
> > @@ -877,6 +886,7 @@ static void timebase_save(PPCTimebase *tb)
> >       * there is no need to update it from KVM here
> >       */
> >      tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
> > +    tb->saved = true;
> >  }
> >  
> >  static void timebase_load(PPCTimebase *tb)
> > @@ -908,6 +918,8 @@ static void timebase_load(PPCTimebase *tb)
> >                          &pcpu->env.tb_env->tb_offset);
> >  #endif
> >      }
> > +
> > +    tb->saved = false;
> >  }
> >  
> >  void cpu_ppc_clock_vm_state_change(void *opaque, int running,
> > diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
> > index deaa46a14b..ec2dbcdcae 100644
> > --- a/target/ppc/cpu-qom.h
> > +++ b/target/ppc/cpu-qom.h
> > @@ -210,6 +210,7 @@ typedef struct PowerPCCPUClass {
> >  typedef struct PPCTimebase {
> >      uint64_t guest_timebase;
> >      int64_t time_of_the_day_ns;
> > +    bool saved;
> >  } PPCTimebase;
> >  
> >  extern const struct VMStateDescription vmstate_ppc_timebase;
> 
> -- 
> David Gibson                    | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
>                                 | _way_ _around_!
> http://www.ozlabs.org/~dgibson
David Gibson July 27, 2018, 1:09 a.m. UTC | #10
On Thu, Jul 26, 2018 at 09:44:11AM +0200, Laurent Vivier wrote:
> On 26/07/2018 07:07, David Gibson wrote:
> > On Thu, May 03, 2018 at 11:20:44PM -0500, Michael Roth wrote:
> >> In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> >> via a runstate hook so we can restore it on qmp_cont(). If a migration
> >> occurs in between those events we end up saving it again, this time
> >> based on the current timebase the guest would be seeing had it been
> >> running. This has the effect of advancing the guest timebase while
> >> it is stopped, which is not what the code intends.
> >>
> >> Other than simple jumps in time, this has been seen to trigger what
> >> appear to be RCU-related crashes in recent kernels when the advance
> >> exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> >> common operations such as `virsh migrate ... --timeout 60`.
> >>
> >> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> Cc: David Gibson <david@gibson.dropbear.id.au>
> >> Cc: Laurent Vivier <lvivier@redhat.com>
> >> Cc: qemu-ppc@nongnu.org
> >> Cc: qemu-stable@nongnu.org
> >> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > 
> > Sorry, this fell off my radar for ages, but I've finally had a chance
> > to look at it properly.
> > 
> > I'm not totally convinced this handle all the possible edge cases
> > correctly, but I am convinced it gives behaviour that's more correct
> > than we have now.  It doesn't introduce changes to the interface or
> > migration stream that would break things in future, so I've applied it
> > to ppc-for-3.1.
> 
> See https://bugzilla.redhat.com/show_bug.cgi?id=1571230#c8
> 
> So you revert:
> 
> commit 42043e4f1241eeb77f87f5816b5cf0b6e9583ed7
> Author: Laurent Vivier <lvivier@redhat.com>
> Date:   Fri Jan 27 13:24:58 2017 +0100
> 
>     spapr: clock should count only if vm is running
> 
> and change the behaviour compared to x86_64
> 
>      6053a86 kvmclock: reduce kvmclock difference on migration
> 
> Is this what you want?

Ah, no, not really.  I've pulled it out of 3.1 again.
David Gibson July 27, 2018, 1:10 a.m. UTC | #11
On Thu, Jul 26, 2018 at 07:30:34AM -0500, Michael Roth wrote:
> Quoting David Gibson (2018-07-26 00:07:46)
> > On Thu, May 03, 2018 at 11:20:44PM -0500, Michael Roth wrote:
> > > In some cases (e.g. spapr) we record guest timebase after qmp_stop()
> > > via a runstate hook so we can restore it on qmp_cont(). If a migration
> > > occurs in between those events we end up saving it again, this time
> > > based on the current timebase the guest would be seeing had it been
> > > running. This has the effect of advancing the guest timebase while
> > > it is stopped, which is not what the code intends.
> > > 
> > > Other than simple jumps in time, this has been seen to trigger what
> > > appear to be RCU-related crashes in recent kernels when the advance
> > > exceeds rcu_cpu_stall_timeout, and it can be triggered by fairly
> > > common operations such as `virsh migrate ... --timeout 60`.
> > > 
> > > Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> > > Cc: David Gibson <david@gibson.dropbear.id.au>
> > > Cc: Laurent Vivier <lvivier@redhat.com>
> > > Cc: qemu-ppc@nongnu.org
> > > Cc: qemu-stable@nongnu.org
> > > Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> > 
> > Sorry, this fell off my radar for ages, but I've finally had a chance
> > to look at it properly.
> 
> No problem, I had assumed it just wasn't deemed needed based on the
> discussion.
> 
> > 
> > I'm not totally convinced this handle all the possible edge cases
> > correctly, but I am convinced it gives behaviour that's more correct
> > than we have now.  It doesn't introduce changes to the interface or
> > migration stream that would break things in future, so I've applied it
> > to ppc-for-3.1.
> 
> There's a flaw with the patch that Greg pointed out to me previously
> in that it doesn't just avoid the presave hook when vm_stop is issued
> manually via monitor, but also when vm_stop is issued by the migration
> code itself. The latter wasn't intentional. I'm not sure if we currently
> have a good way to distinguish between the 2 cases to rectify that.
> 
> As the patch currently stands it is effectively a revert of the
> Laurent's original patch.
> 
> FWIW, the RCU-related crashes mentioned in the original commit turned
> out to be a separate issue that was exacerbated by RCU stall warnings
> messages (which *would* be less likely with this patch). So this would
> be more about user experience and spurious log messages than an actual
> known bug fix. I am still of the opinion that we shouldn't resave the
> clock if the vm_stop was manually-issued; it's just that the current
> patch oversteps that goal.

Right, I agree.  Migrating shouldn't advance the time if we've already
explicitly stopped.  But it's not really clear how to accomplish that
:/.
Mark Cave-Ayland July 27, 2018, 12:35 p.m. UTC | #12
On 27/07/18 02:10, David Gibson wrote:

> Right, I agree.  Migrating shouldn't advance the time if we've already
> explicitly stopped.  But it's not really clear how to accomplish that
> :/.

This topic is obviously of interest to me because it relates to various 
discussions in the past relating to migrating guest timebases under TCG, 
so below is my current understanding of what we are trying to achieve.

Using the following assumptions:

   i) Sending host and receiving host have the same TB freq
  ii) Sending host and receiving host have synchronised clocks

If we ignore any RTC issues for the moment (can we assume that the guest 
will resync itself via NTP or similar on restart?) then it is possible 
to model 2 separate scenarios:


1) Guest is stopped

In this case there is no timebase compensation required on the receiving 
host. When restarted the guest will see the same timebase as at the 
point in time in which it was stopped.

Sender:
   When guest is stopped:
     guestTB = sendHostTB + sendHostTBDiff

Receiver:
   When guest moves to running state:
     recvTBDiff = guestTB - recvHostTB


2) Guest is running

In this case the duration of the migration itself must be considered 
when the guest starts executing on the receiver. Since the guest is 
running the resulting guestTB is compensated for the time spent during 
the migration phase itself.

Sender:
   guestTB = sendHostTB + sendHostTBDiff
   sendhostCLK = qemu_clock_get_ns(QEMU_CLOCK_REALTIME)

Receiver:
   When guest moves to running state:
     recvhostCLK = qemu_clock_get_ns(QEMU_CLOCK_REALTIME)
     migrationTime = recvhostCLK - sendhostCLK
     migrationTBDiff = migrationTime / recvHostTBFreq
     recvTBDiff = guestTB + migrationTBDiff - recvHostTB


The VM running state should already be in the migration stream which 
enables the receiving host to determine between scenarios 1 and 2, and 
in fact they are equivalent if migrationTime is considered to be 0 in 
scenario 1.

There is also the issue as to whether to allow migration between hosts 
with different timebases; it should be possible to calculate the correct 
recvTBDiff if sendTBFreq were included in the migration stream, but 
could the guest OS detect this and change its behaviour accordingly?

Perhaps a solution here is to always have a separate field representing 
the guestTB the last time the VM was stopped (stoppedGuestTB?) set via a 
VM state change handler and include it in the migration stream?

The receiver knows whether the incoming guest is running or not, and so 
can choose whether to use stoppedGuestTB (set at the last VM stop) or 
guestTB (set during timebase_save) for its timebase calculations when 
resuming the guest as required?



ATB,

Mark.
diff mbox series

Patch

diff --git a/hw/ppc/ppc.c b/hw/ppc/ppc.c
index ec4be25f49..ff0a107864 100644
--- a/hw/ppc/ppc.c
+++ b/hw/ppc/ppc.c
@@ -865,6 +865,15 @@  static void timebase_save(PPCTimebase *tb)
     uint64_t ticks = cpu_get_host_ticks();
     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
 
+    /* since we generally save timebase just after the guest
+     * has stopped, avoid trying to save it again since we will
+     * end up advancing it by the amount of ticks that have
+     * elapsed in the host since the initial save
+     */
+    if (tb->saved) {
+        return;
+    }
+
     if (!first_ppc_cpu->env.tb_env) {
         error_report("No timebase object");
         return;
@@ -877,6 +886,7 @@  static void timebase_save(PPCTimebase *tb)
      * there is no need to update it from KVM here
      */
     tb->guest_timebase = ticks + first_ppc_cpu->env.tb_env->tb_offset;
+    tb->saved = true;
 }
 
 static void timebase_load(PPCTimebase *tb)
@@ -908,6 +918,8 @@  static void timebase_load(PPCTimebase *tb)
                         &pcpu->env.tb_env->tb_offset);
 #endif
     }
+
+    tb->saved = false;
 }
 
 void cpu_ppc_clock_vm_state_change(void *opaque, int running,
diff --git a/target/ppc/cpu-qom.h b/target/ppc/cpu-qom.h
index deaa46a14b..ec2dbcdcae 100644
--- a/target/ppc/cpu-qom.h
+++ b/target/ppc/cpu-qom.h
@@ -210,6 +210,7 @@  typedef struct PowerPCCPUClass {
 typedef struct PPCTimebase {
     uint64_t guest_timebase;
     int64_t time_of_the_day_ns;
+    bool saved;
 } PPCTimebase;
 
 extern const struct VMStateDescription vmstate_ppc_timebase;