diff mbox

[QEMU,v2] kvmclock: advance clock by time window between vm_stop and pre_save

Message ID 20161104165933.GA3027@amt.cnet
State New
Headers show

Commit Message

Marcelo Tosatti Nov. 4, 2016, 4:59 p.m. UTC
This patch, relative to pre-copy migration codepath,
measures the time between vm_stop() and pre_save(),
which includes copying the remaining RAM to destination,
and advances the clock by that amount.

In a VM with 5 seconds downtime, this reduces the guest
clock difference on destination from 5s to 0.2s.

Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---

v2: use subsection (Juan Quintela)
    fix older machine types support

Comments

Juan Quintela Nov. 4, 2016, 6:57 p.m. UTC | #1
Marcelo Tosatti <mtosatti@redhat.com> wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(),
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
>
> In a VM with 5 seconds downtime, this reduces the guest
> clock difference on destination from 5s to 0.2s.
>
> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> ---
>
> v2: use subsection (Juan Quintela)
>     fix older machine types support
>

Reviewed-by: Juan Quintela <quintela@redhat.com>
Dr. David Alan Gilbert Nov. 7, 2016, 3:46 p.m. UTC | #2
* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(),
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
> 
> In a VM with 5 seconds downtime, this reduces the guest
> clock difference on destination from 5s to 0.2s.
> 
> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.

One thing that bothers me is that it's only this clock that's
getting corrected; doesn't it cause things to get upset when
one clock moves and the others dont?
Shouldn't the pause delay be recorded somewhere architecturally
independent and then be a thing that kvm-clock happens to use and
other clocks might as well?

Dave

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
> 
> v2: use subsection (Juan Quintela)
>     fix older machine types support
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> index 0f75dd3..a2a02ac 100644
> --- a/hw/i386/kvm/clock.c
> +++ b/hw/i386/kvm/clock.c
> @@ -22,9 +22,11 @@
>  #include "kvm_i386.h"
>  #include "hw/sysbus.h"
>  #include "hw/kvm/clock.h"
> +#include "migration/migration.h"
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> +#include <time.h>
>  
>  #define TYPE_KVM_CLOCK "kvmclock"
>  #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
> @@ -35,7 +37,13 @@ typedef struct KVMClockState {
>      /*< public >*/
>  
>      uint64_t clock;
> +    uint64_t ns;
>      bool clock_valid;
> +
> +    uint64_t advance_clock;
> +    struct timespec t_aftervmstop;
> +
> +    bool adv_clock_enabled;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              s->clock = time_at_migration;
>          }
>  
> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> +            s->clock += s->advance_clock;
> +            s->advance_clock = 0;
> +        }
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              abort();
>          }
>          s->clock = data.clock;
> +        /*
> +         * Transition from VM-running to VM-stopped via migration?
> +         * Record when the VM was stopped.
> +         */
> +
> +        if (state == RUN_STATE_FINISH_MIGRATE &&
> +            !migration_in_postcopy(migrate_get_current())) {
> +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> +        } else {
> +            s->t_aftervmstop.tv_sec = 0;
> +            s->t_aftervmstop.tv_nsec = 0;
> +        }
>  
>          /*
>           * If the VM is stopped, declare the clock state valid to
> @@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> +{
> +    if (before->tv_sec > after->tv_sec ||
> +        (before->tv_sec == after->tv_sec &&
> +         before->tv_nsec > after->tv_nsec)) {
> +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> +        abort();
> +    }
> +
> +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> +            after->tv_nsec - before->tv_nsec;
> +}
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct timespec now;
> +    uint64_t ns;
> +
> +    if (s->t_aftervmstop.tv_sec == 0) {
> +        return;
> +    }
> +
> +    clock_gettime(CLOCK_MONOTONIC, &now);
> +
> +    ns = clock_delta(&s->t_aftervmstop, &now);
> +
> +    /*
> +     * Linux guests can overflow if time jumps
> +     * forward in large increments.
> +     * Cap maximum adjustment to 10 minutes.
> +     */
> +    ns = MIN(ns, 600000000000ULL);
> +
> +    if (s->clock + ns > s->clock) {
> +        s->ns = ns;
> +    }
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /* save the value from incoming migration */
> +    s->advance_clock = s->ns;
> +
> +    return 0;
> +}
> +
> +static bool kvmclock_ns_needed(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->adv_clock_enabled;
> +}
> +
> +static const VMStateDescription kvmclock_advance_ns = {
> +    .name = "kvmclock/advance_ns",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_ns_needed,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(ns, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
> @@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = {
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_advance_ns,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 98dc772..243352e 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "advance_clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Marcelo Tosatti Nov. 7, 2016, 7:41 p.m. UTC | #3
On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(),
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> > 
> > In a VM with 5 seconds downtime, this reduces the guest
> > clock difference on destination from 5s to 0.2s.
> > 
> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> 
> One thing that bothers me is that it's only this clock that's
> getting corrected; doesn't it cause things to get upset when
> one clock moves and the others dont?

If you are correlating the clocks, then yes.

Older Linux guests get upset (marking the TSC clocksource unstable
because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
in newer guests
(kvmclock interface to notify watchdog to not complain).

Note marking TSC clocksource unstable on older guests is harmless
because kvmclock is the standard clocksource.

For Windows guests, i don't know that Windows correlates between different
clocks.

That is, there is relative control as to which software reads kvmclock 
or Windows TIMER MSR, so i don't see the need to advance every clock 
exposed.

> Shouldn't the pause delay be recorded somewhere architecturally
> independent and then be a thing that kvm-clock happens to use and
> other clocks might as well?

In theory, yes. In practice, i don't see the need for this...
Dr. David Alan Gilbert Nov. 7, 2016, 8:03 p.m. UTC | #4
* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > This patch, relative to pre-copy migration codepath,
> > > measures the time between vm_stop() and pre_save(),
> > > which includes copying the remaining RAM to destination,
> > > and advances the clock by that amount.
> > > 
> > > In a VM with 5 seconds downtime, this reduces the guest
> > > clock difference on destination from 5s to 0.2s.
> > > 
> > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > 
> > One thing that bothers me is that it's only this clock that's
> > getting corrected; doesn't it cause things to get upset when
> > one clock moves and the others dont?
> 
> If you are correlating the clocks, then yes.
> 
> Older Linux guests get upset (marking the TSC clocksource unstable
> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> in newer guests
> (kvmclock interface to notify watchdog to not complain).
> 
> Note marking TSC clocksource unstable on older guests is harmless
> because kvmclock is the standard clocksource.
> 
> For Windows guests, i don't know that Windows correlates between different
> clocks.
> 
> That is, there is relative control as to which software reads kvmclock 
> or Windows TIMER MSR, so i don't see the need to advance every clock 
> exposed.
> 
> > Shouldn't the pause delay be recorded somewhere architecturally
> > independent and then be a thing that kvm-clock happens to use and
> > other clocks might as well?
> 
> In theory, yes. In practice, i don't see the need for this... 

It seems unlikely to me that x86 is the only one that will want
to do something similar.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Marcelo Tosatti Nov. 8, 2016, 12:06 a.m. UTC | #5
On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > This patch, relative to pre-copy migration codepath,
> > > > measures the time between vm_stop() and pre_save(),
> > > > which includes copying the remaining RAM to destination,
> > > > and advances the clock by that amount.
> > > > 
> > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > clock difference on destination from 5s to 0.2s.
> > > > 
> > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > 
> > > One thing that bothers me is that it's only this clock that's
> > > getting corrected; doesn't it cause things to get upset when
> > > one clock moves and the others dont?
> > 
> > If you are correlating the clocks, then yes.
> > 
> > Older Linux guests get upset (marking the TSC clocksource unstable
> > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > in newer guests
> > (kvmclock interface to notify watchdog to not complain).
> > 
> > Note marking TSC clocksource unstable on older guests is harmless
> > because kvmclock is the standard clocksource.
> > 
> > For Windows guests, i don't know that Windows correlates between different
> > clocks.
> > 
> > That is, there is relative control as to which software reads kvmclock 
> > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > exposed.
> > 
> > > Shouldn't the pause delay be recorded somewhere architecturally
> > > independent and then be a thing that kvm-clock happens to use and
> > > other clocks might as well?
> > 
> > In theory, yes. In practice, i don't see the need for this... 
> 
> It seems unlikely to me that x86 is the only one that will want
> to do something similar.

Can't they copy what kvmclock is doing today?
Dr. David Alan Gilbert Nov. 8, 2016, 10:22 a.m. UTC | #6
* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > This patch, relative to pre-copy migration codepath,
> > > > > measures the time between vm_stop() and pre_save(),
> > > > > which includes copying the remaining RAM to destination,
> > > > > and advances the clock by that amount.
> > > > > 
> > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > clock difference on destination from 5s to 0.2s.
> > > > > 
> > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > 
> > > > One thing that bothers me is that it's only this clock that's
> > > > getting corrected; doesn't it cause things to get upset when
> > > > one clock moves and the others dont?
> > > 
> > > If you are correlating the clocks, then yes.
> > > 
> > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > in newer guests
> > > (kvmclock interface to notify watchdog to not complain).
> > > 
> > > Note marking TSC clocksource unstable on older guests is harmless
> > > because kvmclock is the standard clocksource.
> > > 
> > > For Windows guests, i don't know that Windows correlates between different
> > > clocks.
> > > 
> > > That is, there is relative control as to which software reads kvmclock 
> > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > exposed.
> > > 
> > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > independent and then be a thing that kvm-clock happens to use and
> > > > other clocks might as well?
> > > 
> > > In theory, yes. In practice, i don't see the need for this... 
> > 
> > It seems unlikely to me that x86 is the only one that will want
> > to do something similar.
> 
> Can't they copy what kvmclock is doing today? 

We shouldn't have copies of code all over should we?

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Marcelo Tosatti Nov. 8, 2016, 1:32 p.m. UTC | #7
On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > > This patch, relative to pre-copy migration codepath,
> > > > > > measures the time between vm_stop() and pre_save(),
> > > > > > which includes copying the remaining RAM to destination,
> > > > > > and advances the clock by that amount.
> > > > > > 
> > > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > > clock difference on destination from 5s to 0.2s.
> > > > > > 
> > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > > 
> > > > > One thing that bothers me is that it's only this clock that's
> > > > > getting corrected; doesn't it cause things to get upset when
> > > > > one clock moves and the others dont?
> > > > 
> > > > If you are correlating the clocks, then yes.
> > > > 
> > > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > > in newer guests
> > > > (kvmclock interface to notify watchdog to not complain).
> > > > 
> > > > Note marking TSC clocksource unstable on older guests is harmless
> > > > because kvmclock is the standard clocksource.
> > > > 
> > > > For Windows guests, i don't know that Windows correlates between different
> > > > clocks.
> > > > 
> > > > That is, there is relative control as to which software reads kvmclock 
> > > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > > exposed.
> > > > 
> > > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > > independent and then be a thing that kvm-clock happens to use and
> > > > > other clocks might as well?
> > > > 
> > > > In theory, yes. In practice, i don't see the need for this... 
> > > 
> > > It seems unlikely to me that x86 is the only one that will want
> > > to do something similar.
> > 
> > Can't they copy what kvmclock is doing today? 
> 
> We shouldn't have copies of code all over should we?
> 
> Dave

Fine i'll add a notifier.
Paolo Bonzini Nov. 9, 2016, 4:23 p.m. UTC | #8
On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>> This patch, relative to pre-copy migration codepath,
>>>>>> measures the time between vm_stop() and pre_save(),
>>>>>> which includes copying the remaining RAM to destination,
>>>>>> and advances the clock by that amount.
>>>>>>
>>>>>> In a VM with 5 seconds downtime, this reduces the guest
>>>>>> clock difference on destination from 5s to 0.2s.
>>>>>>
>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>>>>
>>>>> One thing that bothers me is that it's only this clock that's
>>>>> getting corrected; doesn't it cause things to get upset when
>>>>> one clock moves and the others dont?
>>>>
>>>> If you are correlating the clocks, then yes.
>>>>
>>>> Older Linux guests get upset (marking the TSC clocksource unstable
>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
>>>> in newer guests
>>>> (kvmclock interface to notify watchdog to not complain).
>>>>
>>>> Note marking TSC clocksource unstable on older guests is harmless
>>>> because kvmclock is the standard clocksource.
>>>>
>>>> For Windows guests, i don't know that Windows correlates between different
>>>> clocks.
>>>>
>>>> That is, there is relative control as to which software reads kvmclock 
>>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
>>>> exposed.
>>>>
>>>>> Shouldn't the pause delay be recorded somewhere architecturally
>>>>> independent and then be a thing that kvm-clock happens to use and
>>>>> other clocks might as well?
>>>>
>>>> In theory, yes. In practice, i don't see the need for this... 
>>>
>>> It seems unlikely to me that x86 is the only one that will want
>>> to do something similar.
>>
>> Can't they copy what kvmclock is doing today? 
> 
> We shouldn't have copies of code all over should we?

Let's cross the bridge when we get there.

For now I'm more interested in having a version of the patch that is
clean and doesn't need advance_clock.

Paolo
Dr. David Alan Gilbert Nov. 9, 2016, 4:28 p.m. UTC | #9
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>>>> This patch, relative to pre-copy migration codepath,
> >>>>>> measures the time between vm_stop() and pre_save(),
> >>>>>> which includes copying the remaining RAM to destination,
> >>>>>> and advances the clock by that amount.
> >>>>>>
> >>>>>> In a VM with 5 seconds downtime, this reduces the guest
> >>>>>> clock difference on destination from 5s to 0.2s.
> >>>>>>
> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>>>>
> >>>>> One thing that bothers me is that it's only this clock that's
> >>>>> getting corrected; doesn't it cause things to get upset when
> >>>>> one clock moves and the others dont?
> >>>>
> >>>> If you are correlating the clocks, then yes.
> >>>>
> >>>> Older Linux guests get upset (marking the TSC clocksource unstable
> >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> >>>> in newer guests
> >>>> (kvmclock interface to notify watchdog to not complain).
> >>>>
> >>>> Note marking TSC clocksource unstable on older guests is harmless
> >>>> because kvmclock is the standard clocksource.
> >>>>
> >>>> For Windows guests, i don't know that Windows correlates between different
> >>>> clocks.
> >>>>
> >>>> That is, there is relative control as to which software reads kvmclock 
> >>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
> >>>> exposed.
> >>>>
> >>>>> Shouldn't the pause delay be recorded somewhere architecturally
> >>>>> independent and then be a thing that kvm-clock happens to use and
> >>>>> other clocks might as well?
> >>>>
> >>>> In theory, yes. In practice, i don't see the need for this... 
> >>>
> >>> It seems unlikely to me that x86 is the only one that will want
> >>> to do something similar.
> >>
> >> Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> 
> Let's cross the bridge when we get there.

That will mean it has the migration data in the wrong place
and any other clocks that need to be incremented by the same offset
will need a hook or be inconsistent with this calculation.

Dave

> For now I'm more interested in having a version of the patch that is
> clean and doesn't need advance_clock.
> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini Nov. 9, 2016, 4:33 p.m. UTC | #10
On 09/11/2016 17:28, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonzini@redhat.com) wrote:
>>
>>
>> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>>>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>>>> This patch, relative to pre-copy migration codepath,
>>>>>>>> measures the time between vm_stop() and pre_save(),
>>>>>>>> which includes copying the remaining RAM to destination,
>>>>>>>> and advances the clock by that amount.
>>>>>>>>
>>>>>>>> In a VM with 5 seconds downtime, this reduces the guest
>>>>>>>> clock difference on destination from 5s to 0.2s.
>>>>>>>>
>>>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>>>>>>
>>>>>>> One thing that bothers me is that it's only this clock that's
>>>>>>> getting corrected; doesn't it cause things to get upset when
>>>>>>> one clock moves and the others dont?
>>>>>>
>>>>>> If you are correlating the clocks, then yes.
>>>>>>
>>>>>> Older Linux guests get upset (marking the TSC clocksource unstable
>>>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
>>>>>> in newer guests
>>>>>> (kvmclock interface to notify watchdog to not complain).
>>>>>>
>>>>>> Note marking TSC clocksource unstable on older guests is harmless
>>>>>> because kvmclock is the standard clocksource.
>>>>>>
>>>>>> For Windows guests, i don't know that Windows correlates between different
>>>>>> clocks.
>>>>>>
>>>>>> That is, there is relative control as to which software reads kvmclock 
>>>>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
>>>>>> exposed.
>>>>>>
>>>>>>> Shouldn't the pause delay be recorded somewhere architecturally
>>>>>>> independent and then be a thing that kvm-clock happens to use and
>>>>>>> other clocks might as well?
>>>>>>
>>>>>> In theory, yes. In practice, i don't see the need for this... 
>>>>>
>>>>> It seems unlikely to me that x86 is the only one that will want
>>>>> to do something similar.
>>>>
>>>> Can't they copy what kvmclock is doing today? 
>>>
>>> We shouldn't have copies of code all over should we?
>>
>> Let's cross the bridge when we get there.
> 
> That will mean it has the migration data in the wrong place
> and any other clocks that need to be incremented by the same offset
> will need a hook or be inconsistent with this calculation.

No, there is no additional migration data that is needed.  This is just
a bug in how the pausing of CLOCK_MONOTONIC was implemented for the
kvmclock clocksource.

Right now, x86 is the only case where we have the problem, and x86 is
using a single "backend" for both kvmclock and the Hyper-V TSC reference
page.

For everyone else, there is no clocksource paravirtualization going on
(luckily, considering what a mess is kvmclock).  They can just use
QEMU_CLOCK_VIRTUAL if they want something that pauses during the VM.
Now, QEMU_CLOCK_VIRTUAL actually has the same bug that Marcelo is
fixing, so we may indeed want a common solution if possible.  But again,
let's see first what the code looks like for _one_ clocksource, before
writing a generalized (and thus more complex) solution.

Paolo
Marcelo Tosatti Nov. 9, 2016, 7:32 p.m. UTC | #11
On Tue, Nov 08, 2016 at 11:32:30AM -0200, Marcelo Tosatti wrote:
> On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > > > This patch, relative to pre-copy migration codepath,
> > > > > > > measures the time between vm_stop() and pre_save(),
> > > > > > > which includes copying the remaining RAM to destination,
> > > > > > > and advances the clock by that amount.
> > > > > > > 
> > > > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > > > clock difference on destination from 5s to 0.2s.
> > > > > > > 
> > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > > > 
> > > > > > One thing that bothers me is that it's only this clock that's
> > > > > > getting corrected; doesn't it cause things to get upset when
> > > > > > one clock moves and the others dont?
> > > > > 
> > > > > If you are correlating the clocks, then yes.
> > > > > 
> > > > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > > > in newer guests
> > > > > (kvmclock interface to notify watchdog to not complain).
> > > > > 
> > > > > Note marking TSC clocksource unstable on older guests is harmless
> > > > > because kvmclock is the standard clocksource.
> > > > > 
> > > > > For Windows guests, i don't know that Windows correlates between different
> > > > > clocks.
> > > > > 
> > > > > That is, there is relative control as to which software reads kvmclock 
> > > > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > > > exposed.
> > > > > 
> > > > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > > > independent and then be a thing that kvm-clock happens to use and
> > > > > > other clocks might as well?
> > > > > 
> > > > > In theory, yes. In practice, i don't see the need for this... 
> > > > 
> > > > It seems unlikely to me that x86 is the only one that will want
> > > > to do something similar.
> > > 
> > > Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> > 
> > Dave
> 
> Fine i'll add a notifier.
> 

KVM Linux guests all have to: 

Make CLOCK_MONOTONIC not count during vmpause
(because it mimicks behaviour under suspend/resume 
of baremetal). And because time overflows.

Assuming the HW clock counts while the VM is paused,
it means they have to hook into vmstate change
notifiers to:

        event           action
        vmstop          KVM_GET_CLOCK
        vmstart         KVM_SET_CLOCK (that earlier value)

For x86 we want to start counting the time there 
because while the VM is running, the host TSC 
is keeping track of time. 
So you measure the amount of time between:

        -> Guest VM clock stops ticking.
        -> clock_gettime(CLOCK_MONOTONIC, &pointA);
        ...
        -> presave: clock_gettime(CLOCK_MONOTONIC, &pointB);

I measured the additional time between presave and EOF: its about
5ms.

On destination, there is an additional 30ms between EOF receival 
and restoration of TSC.

Now, the clock difference is 130ms, and i am not sure where it 
comes from, trying to figure out. But the patch should give 35ms 
difference which is pretty good. Chasing that down...

So in summary: i don't see the point of making the code
"generic" without knowing what the other arches 
want to do.
Marcelo Tosatti Nov. 10, 2016, 11:48 a.m. UTC | #12
GOn Wed, Nov 09, 2016 at 05:23:50PM +0100, Paolo Bonzini wrote:
> 
> 
> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>>>> This patch, relative to pre-copy migration codepath,
> >>>>>> measures the time between vm_stop() and pre_save(),
> >>>>>> which includes copying the remaining RAM to destination,
> >>>>>> and advances the clock by that amount.
> >>>>>>
> >>>>>> In a VM with 5 seconds downtime, this reduces the guest
> >>>>>> clock difference on destination from 5s to 0.2s.
> >>>>>>
> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>>>>
> >>>>> One thing that bothers me is that it's only this clock that's
> >>>>> getting corrected; doesn't it cause things to get upset when
> >>>>> one clock moves and the others dont?
> >>>>
> >>>> If you are correlating the clocks, then yes.
> >>>>
> >>>> Older Linux guests get upset (marking the TSC clocksource unstable
> >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> >>>> in newer guests
> >>>> (kvmclock interface to notify watchdog to not complain).
> >>>>
> >>>> Note marking TSC clocksource unstable on older guests is harmless
> >>>> because kvmclock is the standard clocksource.
> >>>>
> >>>> For Windows guests, i don't know that Windows correlates between different
> >>>> clocks.
> >>>>
> >>>> That is, there is relative control as to which software reads kvmclock 
> >>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
> >>>> exposed.
> >>>>
> >>>>> Shouldn't the pause delay be recorded somewhere architecturally
> >>>>> independent and then be a thing that kvm-clock happens to use and
> >>>>> other clocks might as well?
> >>>>
> >>>> In theory, yes. In practice, i don't see the need for this... 
> >>>
> >>> It seems unlikely to me that x86 is the only one that will want
> >>> to do something similar.
> >>
> >> Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> 
> Let's cross the bridge when we get there.
> 
> For now I'm more interested in having a version of the patch that is
> clean and doesn't need advance_clock.
> 
> Paolo


Destination has to run the following logic:

If (source has KVM_CAP_ADVANCE_CLOCK)
    use KVM_GET_CLOCK value
Else
   read pvclock from guest

To support migration from older QEMU versions which do not have
KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
hosts without KVM_CAP_ADVANCE_CLOCK).

I don't see any clean way to give that information, except changing
the migration format to pass "host: kvm_cap_advance_clock=true/false"
information.

Ideas?
Paolo Bonzini Nov. 10, 2016, 5:57 p.m. UTC | #13
On 10/11/2016 12:48, Marcelo Tosatti wrote:
> Destination has to run the following logic:
> 
> If (source has KVM_CAP_ADVANCE_CLOCK)
>     use KVM_GET_CLOCK value
> Else
>    read pvclock from guest
> 
> To support migration from older QEMU versions which do not have
> KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
> hosts without KVM_CAP_ADVANCE_CLOCK).
> 
> I don't see any clean way to give that information, except changing
> the migration format to pass "host: kvm_cap_advance_clock=true/false"
> information.

If you make it only affect new machine types, you could transmit a dummy
clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE.

Paolo
Marcelo Tosatti Nov. 11, 2016, 2:23 p.m. UTC | #14
On Thu, Nov 10, 2016 at 06:57:21PM +0100, Paolo Bonzini wrote:
> 
> 
> On 10/11/2016 12:48, Marcelo Tosatti wrote:
> > Destination has to run the following logic:
> > 
> > If (source has KVM_CAP_ADVANCE_CLOCK)
> >     use KVM_GET_CLOCK value
> > Else
> >    read pvclock from guest
> > 
> > To support migration from older QEMU versions which do not have
> > KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
> > hosts without KVM_CAP_ADVANCE_CLOCK).
> > 
> > I don't see any clean way to give that information, except changing
> > the migration format to pass "host: kvm_cap_advance_clock=true/false"
> > information.
> 
> If you make it only affect new machine types, you could transmit a dummy
> clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE.
> 
> Paolo

Prefer a new subsection (which is fine since migration is broken
anyway), because otherwise you have to deal with restoring
s->clock from -1 to what was read at KVM_GET_CLOCK (in case
migration fails).
Wanpeng Li Feb. 7, 2017, 10:02 a.m. UTC | #15
2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>:
> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>> > This patch, relative to pre-copy migration codepath,
>> > measures the time between vm_stop() and pre_save(),
>> > which includes copying the remaining RAM to destination,
>> > and advances the clock by that amount.
>> >
>> > In a VM with 5 seconds downtime, this reduces the guest
>> > clock difference on destination from 5s to 0.2s.
>> >
>> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>
>> One thing that bothers me is that it's only this clock that's
>> getting corrected; doesn't it cause things to get upset when
>> one clock moves and the others dont?
>
> If you are correlating the clocks, then yes.
>
> Older Linux guests get upset (marking the TSC clocksource unstable
> because the watchdog checks TSC vs kvmclock), but there is a workaround for it
> in newer guests
> (kvmclock interface to notify watchdog to not complain).

Could you point out which interface? I didn't find it.

Regards,
Wanpeng Li

>
> Note marking TSC clocksource unstable on older guests is harmless
> because kvmclock is the standard clocksource.
>
> For Windows guests, i don't know that Windows correlates between different
> clocks.
>
> That is, there is relative control as to which software reads kvmclock
> or Windows TIMER MSR, so i don't see the need to advance every clock
> exposed.
>
>> Shouldn't the pause delay be recorded somewhere architecturally
>> independent and then be a thing that kvm-clock happens to use and
>> other clocks might as well?
>
> In theory, yes. In practice, i don't see the need for this...
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti Feb. 7, 2017, 12:18 p.m. UTC | #16
On Tue, Feb 07, 2017 at 06:02:13PM +0800, Wanpeng Li wrote:
> 2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>:
> > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> > This patch, relative to pre-copy migration codepath,
> >> > measures the time between vm_stop() and pre_save(),
> >> > which includes copying the remaining RAM to destination,
> >> > and advances the clock by that amount.
> >> >
> >> > In a VM with 5 seconds downtime, this reduces the guest
> >> > clock difference on destination from 5s to 0.2s.
> >> >
> >> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>
> >> One thing that bothers me is that it's only this clock that's
> >> getting corrected; doesn't it cause things to get upset when
> >> one clock moves and the others dont?
> >
> > If you are correlating the clocks, then yes.
> >
> > Older Linux guests get upset (marking the TSC clocksource unstable
> > because the watchdog checks TSC vs kvmclock), but there is a workaround for it
> > in newer guests
> > (kvmclock interface to notify watchdog to not complain).
> 
> Could you point out which interface? I didn't find it.
> 
> Regards,
> Wanpeng Li

PVCLOCK_GUEST_STOPPED flag.

> 
> >
> > Note marking TSC clocksource unstable on older guests is harmless
> > because kvmclock is the standard clocksource.
> >
> > For Windows guests, i don't know that Windows correlates between different
> > clocks.
> >
> > That is, there is relative control as to which software reads kvmclock
> > or Windows TIMER MSR, so i don't see the need to advance every clock
> > exposed.
> >
> >> Shouldn't the pause delay be recorded somewhere architecturally
> >> independent and then be a thing that kvm-clock happens to use and
> >> other clocks might as well?
> >
> > In theory, yes. In practice, i don't see the need for this...
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..a2a02ac 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -22,9 +22,11 @@ 
 #include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
+#include "migration/migration.h"
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
+#include <time.h>
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
@@ -35,7 +37,13 @@  typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
+    uint64_t ns;
     bool clock_valid;
+
+    uint64_t advance_clock;
+    struct timespec t_aftervmstop;
+
+    bool adv_clock_enabled;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -100,6 +108,11 @@  static void kvmclock_vm_state_change(void *opaque, int running,
             s->clock = time_at_migration;
         }
 
+        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
+            s->clock += s->advance_clock;
+            s->advance_clock = 0;
+        }
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -135,6 +148,18 @@  static void kvmclock_vm_state_change(void *opaque, int running,
             abort();
         }
         s->clock = data.clock;
+        /*
+         * Transition from VM-running to VM-stopped via migration?
+         * Record when the VM was stopped.
+         */
+
+        if (state == RUN_STATE_FINISH_MIGRATE &&
+            !migration_in_postcopy(migrate_get_current())) {
+            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
+        } else {
+            s->t_aftervmstop.tv_sec = 0;
+            s->t_aftervmstop.tv_nsec = 0;
+        }
 
         /*
          * If the VM is stopped, declare the clock state valid to
@@ -152,6 +177,77 @@  static void kvmclock_realize(DeviceState *dev, Error **errp)
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static uint64_t clock_delta(struct timespec *before, struct timespec *after)
+{
+    if (before->tv_sec > after->tv_sec ||
+        (before->tv_sec == after->tv_sec &&
+         before->tv_nsec > after->tv_nsec)) {
+        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
+                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
+                        before->tv_nsec, after->tv_sec, after->tv_nsec);
+        abort();
+    }
+
+    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
+            after->tv_nsec - before->tv_nsec;
+}
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct timespec now;
+    uint64_t ns;
+
+    if (s->t_aftervmstop.tv_sec == 0) {
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &now);
+
+    ns = clock_delta(&s->t_aftervmstop, &now);
+
+    /*
+     * Linux guests can overflow if time jumps
+     * forward in large increments.
+     * Cap maximum adjustment to 10 minutes.
+     */
+    ns = MIN(ns, 600000000000ULL);
+
+    if (s->clock + ns > s->clock) {
+        s->ns = ns;
+    }
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+
+    /* save the value from incoming migration */
+    s->advance_clock = s->ns;
+
+    return 0;
+}
+
+static bool kvmclock_ns_needed(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->adv_clock_enabled;
+}
+
+static const VMStateDescription kvmclock_advance_ns = {
+    .name = "kvmclock/advance_ns",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_ns_needed,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(ns, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
@@ -159,15 +255,25 @@  static const VMStateDescription kvmclock_vmsd = {
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_advance_ns,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 98dc772..243352e 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -370,6 +370,11 @@  bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "advance_clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\