diff mbox

kvmclock: Ensure time in migration never goes backward

Message ID 1399297882-3444-1-git-send-email-agraf@suse.de
State New
Headers show

Commit Message

Alexander Graf May 5, 2014, 1:51 p.m. UTC
When we migrate we ask the kernel about its current belief on what the guest
time would be. However, I've seen cases where the kvmclock guest structure
indicates a time more recent than the kvm returned time.

To make sure we never go backwards, calculate what the guest would have seen
as time at the point of migration and use that value instead of the kernel
returned one when it's more recent.

While this doesn't fix the underlying issue that the kernel's view of time
is skewed, it allows us to safely migrate guests even from sources that are
known broken.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 hw/i386/kvm/clock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

Comments

Marcin Gibuła May 5, 2014, 5:46 p.m. UTC | #1
W dniu 2014-05-05 15:51, Alexander Graf pisze:
> When we migrate we ask the kernel about its current belief on what the guest
> time would be. However, I've seen cases where the kvmclock guest structure
> indicates a time more recent than the kvm returned time.

Hi,

is it possible to have kvmclock jumping forward?

Because I've reproducible case when at about 1 per 20 vm restores, VM 
freezes for couple of hours and then resumes with date few hundreds 
years ahead. Happens only with kvmclock.

And this patch seems to fix very similar issue so maybe it's all the 
same bug.
Alexander Graf May 5, 2014, 6:05 p.m. UTC | #2
> Am 05.05.2014 um 19:46 schrieb Marcin Gibuła <m.gibula@beyond.pl>:
> 
> W dniu 2014-05-05 15:51, Alexander Graf pisze:
>> When we migrate we ask the kernel about its current belief on what the guest
>> time would be. However, I've seen cases where the kvmclock guest structure
>> indicates a time more recent than the kvm returned time.
> 
> Hi,
> 
> is it possible to have kvmclock jumping forward?
> 
> Because I've reproducible case when at about 1 per 20 vm restores, VM freezes for couple of hours and then resumes with date few hundreds years ahead. Happens only with kvmclock.
> 
> And this patch seems to fix very similar issue so maybe it's all the same bug.

I'm fairly sure it is the exact same bug. Jumping backward is like jumping forward by a biiiiig amount :)

Alex

> 
> -- 
> mg
> 
> 
>
Marcin Gibuła May 5, 2014, 6:26 p.m. UTC | #3
>> is it possible to have kvmclock jumping forward?
>>
>> Because I've reproducible case when at about 1 per 20 vm restores, VM freezes for couple of hours and then resumes with date few hundreds years ahead. Happens only with kvmclock.
>>
>> And this patch seems to fix very similar issue so maybe it's all the same bug.
>
> I'm fairly sure it is the exact same bug. Jumping backward is like jumping forward by a biiiiig amount :)

Hi,

I've tested your patch on my test VM... don't know if it's pure luck or 
not, but it didn't hang with over 70 restores.

The message "KVM Clock migrated backwards, using later time" fires every 
time, but VM is healthy after resume.
Marcelo Tosatti May 5, 2014, 11:23 p.m. UTC | #4
Hi Alexander,

On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
> When we migrate we ask the kernel about its current belief on what the guest
> time would be. 

KVM_GET_CLOCK which returns the time in "struct kvm_clock_data".

> However, I've seen cases where the kvmclock guest structure
> indicates a time more recent than the kvm returned time.

More details please:

1) By what algorithm you retrieve
and compare time in kvmclock guest structure and KVM_GET_CLOCK.
What are the results of the comparison.
And whether a backwards time was visible in the guest.

2) What is the host clocksource.

The test below is not a good one because:

T1) KVM_GET_CLOCK (save s->clock).
T2) save env->tsc.

The difference in scaled time between T1 and T2 is larger than 1
nanosecond, so the

(time_at_migration > s->clock)

check is almost always positive (what matters though is whether 
a time backwards event can be seen reading kvmclock in the guest).

> To make sure we never go backwards, calculate what the guest would have seen
> as time at the point of migration and use that value instead of the kernel
> returned one when it's more recent.
> 
> While this doesn't fix the underlying issue that the kernel's view of time
> is skewed, it allows us to safely migrate guests even from sources that are
> known broken.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>
> ---
>  hw/i386/kvm/clock.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 48 insertions(+)
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> index 892aa02..c6521cf 100644
> --- a/hw/i386/kvm/clock.c
> +++ b/hw/i386/kvm/clock.c
> @@ -14,6 +14,7 @@
>   */
>  
>  #include "qemu-common.h"
> +#include "qemu/host-utils.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/kvm.h"
>  #include "hw/sysbus.h"
> @@ -34,6 +35,47 @@ typedef struct KVMClockState {
>      bool clock_valid;
>  } KVMClockState;
>  
> +struct pvclock_vcpu_time_info {
> +    uint32_t   version;
> +    uint32_t   pad0;
> +    uint64_t   tsc_timestamp;
> +    uint64_t   system_time;
> +    uint32_t   tsc_to_system_mul;
> +    int8_t     tsc_shift;
> +    uint8_t    flags;
> +    uint8_t    pad[2];
> +} __attribute__((__packed__)); /* 32 bytes */
> +
> +static uint64_t kvmclock_current_nsec(KVMClockState *s)
> +{
> +    CPUState *cpu = first_cpu;
> +    CPUX86State *env = cpu->env_ptr;
> +    hwaddr kvmclock_struct_pa = env->system_time_msr & ~1ULL;
> +    uint64_t migration_tsc = env->tsc;
> +    struct pvclock_vcpu_time_info time;
> +    uint64_t delta;
> +    uint64_t nsec_lo;
> +    uint64_t nsec_hi;
> +    uint64_t nsec;
> +
> +    if (!(env->system_time_msr & 1ULL)) {
> +        /* KVM clock not active */
> +        return 0;
> +    }
> +
> +    cpu_physical_memory_read(kvmclock_struct_pa, &time, sizeof(time));
> +
> +    delta = migration_tsc - time.tsc_timestamp;
> +    if (time.tsc_shift < 0) {
> +        delta >>= -time.tsc_shift;
> +    } else {
> +        delta <<= time.tsc_shift;
> +    }
> +
> +    mulu64(&nsec_lo, &nsec_hi, delta, time.tsc_to_system_mul);
> +    nsec = (nsec_lo >> 32) | (nsec_hi << 32);
> +    return nsec + time.system_time;
> +}
>  
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
> @@ -45,9 +87,15 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>  
>      if (running) {
>          struct kvm_clock_data data;
> +        uint64_t time_at_migration = kvmclock_current_nsec(s);
>  
>          s->clock_valid = false;
>  
> +        if (time_at_migration > s->clock) {
> +            fprintf(stderr, "KVM Clock migrated backwards, using later time\n");
> +            s->clock = time_at_migration;
> +        }
> +
>          data.clock = s->clock;
>          data.flags = 0;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> -- 
> 1.7.12.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Marcelo Tosatti May 5, 2014, 11:27 p.m. UTC | #5
On Mon, May 05, 2014 at 08:26:04PM +0200, Marcin Gibuła wrote:
> >>is it possible to have kvmclock jumping forward?
> >>
> >>Because I've reproducible case when at about 1 per 20 vm restores, VM freezes for couple of hours and then resumes with date few hundreds years ahead. Happens only with kvmclock.
> >>
> >>And this patch seems to fix very similar issue so maybe it's all the same bug.
> >
> >I'm fairly sure it is the exact same bug. Jumping backward is like jumping forward by a biiiiig amount :)
> 
> Hi,
> 
> I've tested your path on my test VM... don't know if it's pure luck
> or not, but it didn't hang with over 70 restores.
> 
> The message "KVM Clock migrated backwards, using later time" fires
> every time, but VM is healthy after resume.

What is the host clocksource? (cat
/sys/devices/system/clocksource/clocksource0/current_clocksource). 

And kernel version?
Marcelo Tosatti May 5, 2014, 11:31 p.m. UTC | #6
On Mon, May 05, 2014 at 08:23:43PM -0300, Marcelo Tosatti wrote:
> Hi Alexander,
> 
> On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
> > When we migrate we ask the kernel about its current belief on what the guest
> > time would be. 
> 
> KVM_GET_CLOCK which returns the time in "struct kvm_clock_data".
> 
> > However, I've seen cases where the kvmclock guest structure
> > indicates a time more recent than the kvm returned time.

This should not happen because the value returned by KVM_GET_CLOCK 
(get_kernel_ns() + kvmclock_offset) should be relatively in sync
with what is seen in the guest via kvmclock read.
Marcelo Tosatti May 5, 2014, 11:31 p.m. UTC | #7
Marcin,

Can you provide detailed instructions on how to reproduce the problem? 

Thanks

On Mon, May 05, 2014 at 08:27:10PM -0300, Marcelo Tosatti wrote:
> On Mon, May 05, 2014 at 08:26:04PM +0200, Marcin Gibuła wrote:
> > >>is it possible to have kvmclock jumping forward?
> > >>
> > >>Because I've reproducible case when at about 1 per 20 vm restores, VM freezes for couple of hours and then resumes with date few hundreds years ahead. Happens only with kvmclock.
> > >>
> > >>And this patch seems to fix very similar issue so maybe it's all the same bug.
> > >
> > >I'm fairly sure it is the exact same bug. Jumping backward is like jumping forward by a biiiiig amount :)
> > 
> > Hi,
> > 
> > I've tested your path on my test VM... don't know if it's pure luck
> > or not, but it didn't hang with over 70 restores.
> > 
> > The message "KVM Clock migrated backwards, using later time" fires
> > every time, but VM is healthy after resume.
> 
> What is the host clocksource? (cat
> /sys/devices/system/clocksource/clocksource0/current_clocksource). 
> 
> And kernel version?
>
Alexander Graf May 6, 2014, 7:11 a.m. UTC | #8
On 06.05.14 01:27, Marcelo Tosatti wrote:
> On Mon, May 05, 2014 at 08:26:04PM +0200, Marcin Gibuła wrote:
>>>> is it possible to have kvmclock jumping forward?
>>>>
>>>> Because I've reproducible case when at about 1 per 20 vm restores, VM freezes for couple of hours and then resumes with date few hundreds years ahead. Happens only with kvmclock.
>>>>
>>>> And this patch seems to fix very similar issue so maybe it's all the same bug.
>>> I'm fairly sure it is the exact same bug. Jumping backward is like jumping forward by a biiiiig amount :)
>> Hi,
>>
>> I've tested your path on my test VM... don't know if it's pure luck
>> or not, but it didn't hang with over 70 restores.
>>
>> The message "KVM Clock migrated backwards, using later time" fires
>> every time, but VM is healthy after resume.
> What is the host clocksource? (cat
> /sys/devices/system/clocksource/clocksource0/current_clocksource).
>
> And kernel version?

I've seen 3 different reports of this bug by now. One is Marcin where I 
don't have any details. One is running 3.0 plus patches and another one 
is running 3.14.

For the 3.0 case the host clock source is TSC.


Alex
Alexander Graf May 6, 2014, 7:16 a.m. UTC | #9
On 06.05.14 01:23, Marcelo Tosatti wrote:
> Hi Alexander,
>
> On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
>> When we migrate we ask the kernel about its current belief on what the guest
>> time would be.
> KVM_GET_CLOCK which returns the time in "struct kvm_clock_data".

Yes :)

>
>> However, I've seen cases where the kvmclock guest structure
>> indicates a time more recent than the kvm returned time.
> More details please:
>
> 1) By what algorithm you retrieve
> and compare time in kvmclock guest structure and KVM_GET_CLOCK.
> What are the results of the comparison.
> And whether and backwards time was visible in the guest.

I've managed to get my hands on a broken migration stream from Nick. 
There I looked at the curr_clocksource structure and saw that the last 
seen time on the kvmclock clock source was greater than the value that 
the kvmclock device migrated.

> 2) What is the host clocksource.
>
> The test below is not a good one because:
>
> T1) KVM_GET_CLOCK (save s->clock).
> T2) save env->tsc.
>
> The difference in scaled time between T1 and T2 is larger than 1
> nanosecond, so the
>
> (time_at_migration > s->clock)
>
> check is almost always positive (what matters though is whether
> time backwards event can be seen reading kvmclock in the guest).

Yeah, at first I was thinking it makes sense to just not use the 
KVM_GET_CLOCK value at all because it's redundant to the in-memory 
values. Maybe that is the better alternative after all.


Alex
Alexander Graf May 6, 2014, 7:18 a.m. UTC | #10
On 06.05.14 01:31, Marcelo Tosatti wrote:
> On Mon, May 05, 2014 at 08:23:43PM -0300, Marcelo Tosatti wrote:
>> Hi Alexander,
>>
>> On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
>>> When we migrate we ask the kernel about its current belief on what the guest
>>> time would be.
>> KVM_GET_CLOCK which returns the time in "struct kvm_clock_data".
>>
>>> However, I've seen cases where the kvmclock guest structure
>>> indicates a time more recent than the kvm returned time.
> This should not happen because the value returned by KVM_GET_CLOCK
> (get_kernel_ns() + kvmclock_offset) should be relatively in sync
> with what is seen in the guest via kvmclock read.
>

Yes, and it isn't. Any ideas why it's not? This patch really just uses 
the guest visible kvmclock time rather than the host view of it on 
migration.

There is definitely something very broken on the host's side since it 
does return a smaller time than the guest exposed interface indicates.


Alex
Marcin Gibuła May 6, 2014, 7:37 a.m. UTC | #11
> What is the host clocksource? (cat
> /sys/devices/system/clocksource/clocksource0/current_clocksource).

tsc

> And kernel version?

3.12.17
But I've seen this problem on earlier versions as well (3.8, 3.10).
Marcin Gibuła May 6, 2014, 8:07 a.m. UTC | #12
On 06.05.2014 01:31, Marcelo Tosatti wrote:
> Marcin,
>
> Can you provide detailed instructions on how to reproduce the problem?

Some time ago I found that some virtual machines hang after migration. 
Problem seemed to be linux specific (windows guests never hanged) and 
somehow related to kvm-clock. After seeking solution on qemu-devel I 
got mail from a guy who also encountered same problem and had VM image 
saved. We were both able to reproduce the same hang, using different 
host kernels and configs.

This VM is clean install of Centos 6.5 but I'm currently not permitted 
to distribute its saved image.

How to reproduce:
1. Save VM to file,
2. Restore VM from file,
3. Connect to VNC,
4. If console is responsive, kill VM and goto 2.

The facts we established:
- VM _sometimes_ hangs on restore. On my system it was about 1 on 20 
restores,
- I don't know if it's a bug with saving VM state or with restoring it. 
The random chance suggests the latter,
- VM hangs for some undetermined time and then resumes with date in 
far future (i.e. 500 years ahead, but I've seen larger values as well),
- only system time on VM is corrupted, hwclock returns correct value,
- the frequency of this bug somehow depends on host kernel config. I.e. 
changing HZ to 250 makes it more likely,
- the VM is unresponsive to anything than NMI due to spinning with 
interrupts disabled in __run_timers function in kernel/timer.c:

while (time_after_eq(jiffies, base->timer_jiffies)) {
   ...
}

(though my disassembly skills are limited so take it with a grain of salt)

- running vm kernel with nokvmclock option fixes the problem,
- VM responds to NMI with usual message with constant timestamp (as if 
time is frozen in VM),
- I've seen this with all types of linux guests with different guest 
kernels - debian, ubuntu, centos,
- I've seen this with different host kernels as well, the earliest I 
tested was 3.8, the newest 3.12,
- I've seen this on both Intel and AMD hosts.

I'm attaching my host .config.
Marcin Gibuła May 6, 2014, 7:54 p.m. UTC | #13
> Yes, and it isn't. Any ideas why it's not? This patch really just uses
> the guest visible kvmclock time rather than the host view of it on
> migration.
>
> There is definitely something very broken on the host's side since it
> does return a smaller time than the guest exposed interface indicates.

Don't know if helps but here are example values from time_at_migration 
and s->clock from your patch.

Tested on 5 restores of saved VM that (used to) hang:

    s->clock      time_at_migration
157082235125698  157113284546655
157082235125698  157113298196976
157082235125698  157113284615117
157082235125698  157113284486601
157082235125698  157113284479740

Now, when I compare system time on guest with and without patch:

On unpatched qemu vm restores with date: Apr 18 06:56:36
On patched qemu it says: Apr 18 06:57:06
Nicholas Thomas May 7, 2014, 10:04 a.m. UTC | #14
Hi all,

On 06/05/14 08:16, Alexander Graf wrote:
> 
> On 06.05.14 01:23, Marcelo Tosatti wrote:
>
>> 1) By what algorithm you retrieve
>> and compare time in kvmclock guest structure and KVM_GET_CLOCK.
>> What are the results of the comparison.
>> And whether and backwards time was visible in the guest.
> 
> I've managed to get my hands on a broken migration stream from Nick.
> There I looked at the curr_clocksource structure and saw that the last
> seen time on the kvmclock clock source was greater than the value that
> the kvmclock device migrated.

We've been seeing live migration failures where the guest sees time go
backwards (= massive forward leap to the kernel, apparently)  for a
while now, affecting perhaps 5-10% of migrations we'd do (usually a
large proportion of the migrations on a few hosts, rather than an even
spread); initially in December, when we tried an upgrade to QEMU 1.7.1
and a 3.mumble (3.10?) kernel, from 1.5.0 and Debian's 3.2.

My testing at the time seemed to indicate that either upgrade - qemu or
kernel - caused the problems to show up. Guest symptoms are that the
kernel enters a tight loop in __run_timers and stays there. In the end,
I gave up and downgraded us again without any clear idea of what was
happening, or why.

In April, we finally got together a fairly reliable test case. This
patch resolves the guest hangs in that test, and I've also been able to
conduct > 1000 migrations of production guests without seeing the issue
recur. So,

Tested-by: Nick Thomas <nick@bytemark.co.uk>

/Nick
Marcelo Tosatti May 7, 2014, 11:21 p.m. UTC | #15
On Tue, May 06, 2014 at 09:18:27AM +0200, Alexander Graf wrote:
> 
> On 06.05.14 01:31, Marcelo Tosatti wrote:
> >On Mon, May 05, 2014 at 08:23:43PM -0300, Marcelo Tosatti wrote:
> >>Hi Alexander,
> >>
> >>On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
> >>>When we migrate we ask the kernel about its current belief on what the guest
> >>>time would be.
> >>KVM_GET_CLOCK which returns the time in "struct kvm_clock_data".
> >>
> >>>However, I've seen cases where the kvmclock guest structure
> >>>indicates a time more recent than the kvm returned time.
> >This should not happen because the value returned by KVM_GET_CLOCK
> >(get_kernel_ns() + kvmclock_offset) should be relatively in sync
> >with what is seen in the guest via kvmclock read.
> >
> 
> Yes, and it isn't. Any ideas why it's not? This patch really just
> uses the guest visible kvmclock time rather than the host view of it
> on migration.

Effective frequency of TSC clock could be higher than NTP frequency. So NTP
correction would slow down the host clock.

> There is definitely something very broken on the host's side since
> it does return a smaller time than the guest exposed interface
> indicates.
> 
> 
> Alex

Can you please retrieve the values of system_timestamp, tsc_timestamp and
the host clock on the trace you have ?

Nothing forbids backwards time even without migration, in case the problem
is the difference in frequency between TSC and NTP corrected host clock
(point is, should figure out what is happening).
Marcelo Tosatti May 7, 2014, 11:23 p.m. UTC | #16
On Tue, May 06, 2014 at 09:54:35PM +0200, Marcin Gibuła wrote:
> >Yes, and it isn't. Any ideas why it's not? This patch really just uses
> >the guest visible kvmclock time rather than the host view of it on
> >migration.
> >
> >There is definitely something very broken on the host's side since it
> >does return a smaller time than the guest exposed interface indicates.
> 
> Don't know if helps but here are example values from
> time_at_migration and s->clock from your patch.
> 
> Tested on 5 restores of saved VM that (used to) hang:
> 
>    s->clock      time_at_migration
> 157082235125698  157113284546655
> 157082235125698  157113298196976
> 157082235125698  157113284615117
> 157082235125698  157113284486601
> 157082235125698  157113284479740
> 
> Now, when I compare system time on guest with and without patch:
> 
> On unpatched qemu vm restores with date: Apr 18 06:56:36
> On patched qemu it says: Apr 18 06:57:06
> 
> -- 
> mg

Marcin,

Can you turn off the TSC clock on the host and attempt to reproduce the 
issue, without Alexander's patch? 

Just 

echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource

(or any other clock except tsc if you don't have hpet).
TIA.
Alexander Graf May 7, 2014, 11:29 p.m. UTC | #17
On 08.05.14 01:21, Marcelo Tosatti wrote:
> On Tue, May 06, 2014 at 09:18:27AM +0200, Alexander Graf wrote:
>> On 06.05.14 01:31, Marcelo Tosatti wrote:
>>> On Mon, May 05, 2014 at 08:23:43PM -0300, Marcelo Tosatti wrote:
>>>> Hi Alexander,
>>>>
>>>> On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
>>>>> When we migrate we ask the kernel about its current belief on what the guest
>>>>> time would be.
>>>> KVM_GET_CLOCK which returns the time in "struct kvm_clock_data".
>>>>
>>>>> However, I've seen cases where the kvmclock guest structure
>>>>> indicates a time more recent than the kvm returned time.
>>> This should not happen because the value returned by KVM_GET_CLOCK
>>> (get_kernel_ns() + kvmclock_offset) should be relatively in sync
>>> with what is seen in the guest via kvmclock read.
>>>
>> Yes, and it isn't. Any ideas why it's not? This patch really just
>> uses the guest visible kvmclock time rather than the host view of it
>> on migration.
> Effective frequency of TSC clock could be higher than NTP frequency. So NTP
> correction would slow down the host clock.
>
>> There is definitely something very broken on the host's side since
>> it does return a smaller time than the guest exposed interface
>> indicates.
>>
>>
>> Alex
> Can you please retrieve the values of system_timestamp, tsc_timestamp and
> the host clock on the trace you have ?

These are the kvmclock data fields at the point in time of migration:

KVM Time
    |- version           0x22
    |- tsc_timestamp     0xa6af10dbce
    |- system_time       0x4eac2ae060
    |- tsc_to_system_mul 0xf28f5431
    |- tsc_shift         0xffffffff
    |- flags             0

and this is what other bits I could fetch from the migration:

         "env.tsc_offset": "0x0000000000000000",
         "env.tsc": "0x000004288b11c6f8",
         "env.tsc_aux": "0x0000000000000000",
     "timer (0)": {
         "cpu_ticks_offset": "0x00000427ee4d1efd",
         "dummy": "0x0000000000000000",
         "cpu_clock_offset": "0x000001f803249a8d"
     },
     "kvmclock (7)": {
         "clock": "0x000001f80325f418"
     },

I'm not sure how I could get access to the host clock, as this is a post 
mortem analysis and I haven't been able to reproduce this myself. 
However, I'm sure the two other folks who already replied to the thread 
would be more than happy to run something if we tell them what :).

>
> Nothing forbids backwards time even without migration, in case the problem
> is the difference in frequency between TSC and NTP corrected host clock
> (point is, should figure out what is happening).

Yes, I fully agree. We still need a patch similar to this one to ensure 
that we can fetch a guest from a broken host though. Maybe we should 
just have a CAP that indicates the host is working once we fix kvm and 
bump the QEMU kvmclock version to v2 with a new field that means "trust 
the value". Default it to "no" for v1 migrations and when it's set to 
"no", use the guest values instead.


Alex
Marcelo Tosatti May 8, 2014, 1:33 a.m. UTC | #18
On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
> When we migrate we ask the kernel about its current belief on what the guest
> time would be. However, I've seen cases where the kvmclock guest structure
> indicates a time more recent than the kvm returned time.
> 
> To make sure we never go backwards, calculate what the guest would have seen
> as time at the point of migration and use that value instead of the kernel
> returned one when it's more recent.
> 
> While this doesn't fix the underlying issue that the kernel's view of time
> is skewed, it allows us to safely migrate guests even from sources that are
> known broken.
> 
> Signed-off-by: Alexander Graf <agraf@suse.de>

OK Alexander better move this logic to the kernel, in KVM_GET_CLOCK.

Otherwise every user of KVM_GET_CLOCK would have to apply the
workaround.
Alexander Graf May 8, 2014, 7:21 a.m. UTC | #19
On 08.05.14 03:33, Marcelo Tosatti wrote:
> On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
>> When we migrate we ask the kernel about its current belief on what the guest
>> time would be. However, I've seen cases where the kvmclock guest structure
>> indicates a time more recent than the kvm returned time.
>>
>> To make sure we never go backwards, calculate what the guest would have seen
>> as time at the point of migration and use that value instead of the kernel
>> returned one when it's more recent.
>>
>> While this doesn't fix the underlying issue that the kernel's view of time
>> is skewed, it allows us to safely migrate guests even from sources that are
>> known broken.
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
> OK Alexander better move this logic to the kernel, in KVM_GET_CLOCK.
>
> Otherwise every user of KVM_GET_CLOCK would have to apply the
> workaround.

Well, the breakage occurs on the *source* of things. So if I have 100 
VMs running, I'm pretty sure one of them gets hit by this bug.

If I put the workaround in the kernel, I have to take the chance to 
break some of those 100 VMs to get things rolling. If I put it in QEMU, 
I can live with a broken migration source.

This gets even worse when you want to phase out unfixed host kernels.


Alex
Marcelo Tosatti May 9, 2014, 2:28 a.m. UTC | #20
On Thu, May 08, 2014 at 09:21:29AM +0200, Alexander Graf wrote:
> 
> On 08.05.14 03:33, Marcelo Tosatti wrote:
> >On Mon, May 05, 2014 at 03:51:22PM +0200, Alexander Graf wrote:
> >>When we migrate we ask the kernel about its current belief on what the guest
> >>time would be. However, I've seen cases where the kvmclock guest structure
> >>indicates a time more recent than the kvm returned time.
> >>
> >>To make sure we never go backwards, calculate what the guest would have seen
> >>as time at the point of migration and use that value instead of the kernel
> >>returned one when it's more recent.
> >>
> >>While this doesn't fix the underlying issue that the kernel's view of time
> >>is skewed, it allows us to safely migrate guests even from sources that are
> >>known broken.
> >>
> >>Signed-off-by: Alexander Graf <agraf@suse.de>
> >OK Alexander better move this logic to the kernel, in KVM_GET_CLOCK.
> >
> >Otherwise every user of KVM_GET_CLOCK would have to apply the
> >workaround.
> 
> Well, the breakage occurs on the *source* of things. So if I have
> 100 VMs running, I'm pretty sure one of them gets hit by this bug.
> 
> If I put the workaround in the kernel, I have to take the chance to
> break some of those 100 VMs to get things rolling. If I put it in
> QEMU, I can live with a broken migration source.
> 
> This gets even worse when you want to phase out unfixed host kernels.
> 
> 
> Alex

Alex,

Inability to upgrade systems is not an excuse to fix the bug in the
wrong place. 

I'll send a kernel patch.
Paolo Bonzini May 9, 2014, 11:53 a.m. UTC | #21
Il 09/05/2014 04:28, Marcelo Tosatti ha scritto:
> Alex,
>
> Unability to upgrade systems is not an excuse to fix the bug in the
> wrong place.

It may be an excuse to fix the bug in both places though.

Paolo
Alexander Graf May 12, 2014, 8:26 p.m. UTC | #22
On 09.05.14 13:53, Paolo Bonzini wrote:
> Il 09/05/2014 04:28, Marcelo Tosatti ha scritto:
>> Alex,
>>
>> Unability to upgrade systems is not an excuse to fix the bug in the
>> wrong place.
>
> It may be an excuse to fix the bug in both places though.

The bug in the kernel should be fixed differently though to ensure we 
don't go backwards in time during runtime, no?


Alex
Marcelo Tosatti May 14, 2014, 6:47 a.m. UTC | #23
On Fri, May 09, 2014 at 01:53:32PM +0200, Paolo Bonzini wrote:
> Il 09/05/2014 04:28, Marcelo Tosatti ha scritto:
> >Alex,
> >
> >Unability to upgrade systems is not an excuse to fix the bug in the
> >wrong place.
> 
> It may be an excuse to fix the bug in both places though.
> 
> Paolo

Actually, its messy to fix this in kernel, because its necessary
to read TSC MSR of a remote vcpu.

So it seems more appropriate to do this in QEMU.

Sorry for the noise. Alexander, can you resend without the printf? 

Also checking for "larger than" is not necessary as discussed because
it is always true.
Marcelo Tosatti May 14, 2014, 7:26 a.m. UTC | #24
On Mon, May 12, 2014 at 10:26:37PM +0200, Alexander Graf wrote:
> 
> On 09.05.14 13:53, Paolo Bonzini wrote:
> >Il 09/05/2014 04:28, Marcelo Tosatti ha scritto:
> >>Alex,
> >>
> >>Unability to upgrade systems is not an excuse to fix the bug in the
> >>wrong place.
> >
> >It may be an excuse to fix the bug in both places though.
> 
> The bug in the kernel should be fixed differently though to ensure
> we don't go backwards in time during runtime, no?

That can't happen because we don't ever update it, after
"KVM: x86: disable master clock if TSC is reset during suspend".
diff mbox

Patch

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 892aa02..c6521cf 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -14,6 +14,7 @@ 
  */
 
 #include "qemu-common.h"
+#include "qemu/host-utils.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
 #include "hw/sysbus.h"
@@ -34,6 +35,47 @@  typedef struct KVMClockState {
     bool clock_valid;
 } KVMClockState;
 
+struct pvclock_vcpu_time_info {
+    uint32_t   version;
+    uint32_t   pad0;
+    uint64_t   tsc_timestamp;
+    uint64_t   system_time;
+    uint32_t   tsc_to_system_mul;
+    int8_t     tsc_shift;
+    uint8_t    flags;
+    uint8_t    pad[2];
+} __attribute__((__packed__)); /* 32 bytes */
+
+static uint64_t kvmclock_current_nsec(KVMClockState *s)
+{
+    CPUState *cpu = first_cpu;
+    CPUX86State *env = cpu->env_ptr;
+    hwaddr kvmclock_struct_pa = env->system_time_msr & ~1ULL;
+    uint64_t migration_tsc = env->tsc;
+    struct pvclock_vcpu_time_info time;
+    uint64_t delta;
+    uint64_t nsec_lo;
+    uint64_t nsec_hi;
+    uint64_t nsec;
+
+    if (!(env->system_time_msr & 1ULL)) {
+        /* KVM clock not active */
+        return 0;
+    }
+
+    cpu_physical_memory_read(kvmclock_struct_pa, &time, sizeof(time));
+
+    delta = migration_tsc - time.tsc_timestamp;
+    if (time.tsc_shift < 0) {
+        delta >>= -time.tsc_shift;
+    } else {
+        delta <<= time.tsc_shift;
+    }
+
+    mulu64(&nsec_lo, &nsec_hi, delta, time.tsc_to_system_mul);
+    nsec = (nsec_lo >> 32) | (nsec_hi << 32);
+    return nsec + time.system_time;
+}
 
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
@@ -45,9 +87,15 @@  static void kvmclock_vm_state_change(void *opaque, int running,
 
     if (running) {
         struct kvm_clock_data data;
+        uint64_t time_at_migration = kvmclock_current_nsec(s);
 
         s->clock_valid = false;
 
+        if (time_at_migration > s->clock) {
+            fprintf(stderr, "KVM Clock migrated backwards, using later time\n");
+            s->clock = time_at_migration;
+        }
+
         data.clock = s->clock;
         data.flags = 0;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);