Message ID | 20110301162500.GA14959@peq.hallyn.com |
---|---|
State | Superseded |
On 03/01/2011 09:25 AM, Serge E. Hallyn wrote:
> Hi,
>
> attached is a patch which fixes bug 714335. Here is the SRU
> justification:
>
> 1. impact: KVM smp guests hang on AMD
> 2. how was the bug addressed: Two upstream commits (plus a third,
>    auxiliary one) were re-implemented on lucid's kernel.
> 3. patch: see below
> 4. TEST CASE: in lucid, on an AMD box, start a guest with '-smp 2'.
> 5. regression: Honestly this one scares me a bit because it was a
>    tricky cherrypick, and affects timekeeping for all kvm users on
>    x86 lucid. In theory timekeeping on guests could be adversely
>    affected, or guests could fail to boot. However, guests have
>    been tested on both AMD and intel hosts.
>
> From 2b08698ef4a73fd3fc025e3fa01dc0f524f591c6 Mon Sep 17 00:00:00 2001
> From: Serge E. Hallyn <serge.hallyn@canonical.com>
> Date: Mon, 28 Feb 2011 20:48:41 +0000
> Subject: [PATCH 1/1] kvm: cherrypick three kvm fixes
>
> 1d5f066e0b63271b67eac6d3752f8aa96adcbddb: KVM: x86: Fix a possible backwards warp of kvmclock
> and
> 28e4639adf0c9f26f6bb56149b7ab547bf33bb95: KVM: x86: Fix kvmclock bug
> and
> 347bb4448c2155eb2310923ccaa4be5677649003: x86: pvclock: Move scale_delta into common header
>
> Hopefully this will fix qemu hangs on AMD systems which involve
> -smp X (X >= 2) and time. (LP: #714335)
>
> Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
> ---
>  arch/x86/include/asm/kvm_host.h |  3 ++
>  arch/x86/include/asm/pvclock.h  | 38 ++++++++++++++++++++++++++++++
>  arch/x86/kvm/x86.c              | 48 ++++++++++++++++++++++++++++++++++++--
>  3 files changed, 86 insertions(+), 3 deletions(-)
>
> [diff snipped -- the full patch is reproduced below]

Serge - Can we get this as 3 different patches, each with an
appropriate 'backported from' or 'cherry-picked from' SHA1 ID?
Quoting Tim Gardner (tim.gardner@canonical.com):
> Serge - Can we get this as 3 different patches, each with an
> appropriate 'backported from' or 'cherry-picked from' SHA1 ID?

Sure. They are actually not really 'independent fixes' IMO, they just
went upstream that way, but if it helps bookkeeping then no problem.

-serge
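For readers who do not want to decode the x86 inline assembly, the
pvclock_scale_delta() helper introduced by the patch below computes
((delta << shift) * mul_frac) >> 32, treating mul_frac as a 32-bit binary
fraction. Here is a minimal portable-C sketch of that arithmetic; it is
illustrative only, not part of the patch, and it assumes the GCC/Clang
unsigned __int128 extension (the name scale_delta_model is likewise
invented):

```c
#include <stdint.h>

/*
 * Sketch only -- not from the patch.  Both the i386 and the x86_64 asm
 * paths of pvclock_scale_delta() compute the low 64 bits of
 * ((delta << shift) * mul_frac) >> 32: a 64x32 -> 96-bit multiply that
 * keeps bits [95:32] of the product.
 */
static inline uint64_t scale_delta_model(uint64_t delta, uint32_t mul_frac,
                                         int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        /* unsigned __int128 is a GCC/Clang extension; fine for a sketch */
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}
```

With kvmclock's tsc_to_system_mul as the fraction and tsc_shift as the
shift, this converts a TSC cycle delta into nanoseconds.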
```diff
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 600807b..08bc2ff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -357,6 +357,9 @@ struct kvm_vcpu_arch {
 	struct page *time_page;
 
 	bool singlestep; /* guest is single stepped by KVM */
+	u64 last_guest_tsc;
+	u64 last_kernel_ns;
+
 	bool nmi_pending;
 	bool nmi_injected;
 
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd..44b9ae7 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,4 +11,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
 
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul %5 ; "
+		"mov %4,%%eax ; "
+		"mov %%edx,%4 ; "
+		"mul %5 ; "
+		"xor %5,%5 ; "
+		"add %4,%%eax ; "
+		"adc %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2c02a2..a857d05 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -47,6 +47,7 @@
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/pvclock.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS	\
@@ -633,6 +634,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
+	s64 kernel_ns, max_kernel_ns;
+	u64 tsc_timestamp;
 
 	if ((!vcpu->time_page))
 		return;
@@ -646,15 +649,52 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	//kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
 	ktime_get_ts(&ts);
 	monotonic_to_bootbased(&ts);
 	local_irq_restore(flags);
+	kernel_ns = timespec_to_ns(&ts);
+
+	/*
+	 * Time as measured by the TSC may go backwards when resetting the base
+	 * tsc_timestamp.  The reason for this is that the TSC resolution is
+	 * higher than the resolution of the other clock scales.  Thus, many
+	 * possible measurements of the TSC correspond to one measurement of any
+	 * other clock, and so a spread of values is possible.  This is not a
+	 * problem for the computation of the nanosecond clock; with TSC rates
+	 * around 1GHz, there can only be a few cycles which correspond to one
+	 * nanosecond value, and any path through this code will inevitably
+	 * take longer than that.  However, with the kernel_ns value itself,
+	 * the precision may be much lower, down to HZ granularity.  If the
+	 * first sampling of TSC against kernel_ns ends in the low part of the
+	 * range, and the second in the high end of the range, we can get:
+	 *
+	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+	 *
+	 * As the sampling errors potentially range in the thousands of cycles,
+	 * it is possible such a time value has already been observed by the
+	 * guest.  To protect against this, we must compute the system time as
+	 * observed by the guest and ensure the new system time is greater.
+	 */
+	max_kernel_ns = 0;
+	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+		max_kernel_ns = vcpu->last_guest_tsc -
+				vcpu->hv_clock.tsc_timestamp;
+		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+				    vcpu->hv_clock.tsc_to_system_mul,
+				    vcpu->hv_clock.tsc_shift);
+		max_kernel_ns += vcpu->last_kernel_ns;
+	}
+
+	if (max_kernel_ns > kernel_ns)
+		kernel_ns = max_kernel_ns;
 
 	/* With all the info we got, fill in the values */
 
-	vcpu->hv_clock.system_time = ts.tv_nsec +
-		(NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+	vcpu->last_kernel_ns = kernel_ns;
+	vcpu->last_guest_tsc = tsc_timestamp;
 
 	/*
 	 * The interface expects us to write an even number signaling that the
@@ -3695,6 +3735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	kvm_load_guest_fpu(vcpu);
 
+	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
 	local_irq_disable();
 
 	clear_bit(KVM_REQ_KICK, &vcpu->requests);
```
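To make the backwards-warp clamp in the x86.c hunk easier to follow, here
is a hedged user-space sketch of the same logic. struct vcpu_clock and
clamped_kernel_ns() are invented names that merely mirror the fields the
patch adds to struct kvm_vcpu_arch and reads from hv_clock; the scaling
step repeats the pvclock_scale_delta() math from the earlier sketch:

```c
#include <stdint.h>

/* Invented for illustration; mirrors the state the patch maintains. */
struct vcpu_clock {
        uint64_t tsc_timestamp;     /* host TSC at the last hv_clock update */
        uint64_t last_guest_tsc;    /* host TSC sampled on last guest entry */
        int64_t  last_kernel_ns;    /* kernel_ns published at that update */
        uint32_t tsc_to_system_mul; /* TSC -> ns multiplier (binary fraction) */
        int      tsc_shift;         /* pre-multiply shift */
};

static int64_t clamped_kernel_ns(const struct vcpu_clock *c, int64_t kernel_ns)
{
        int64_t max_kernel_ns = 0;

        if (c->tsc_timestamp && c->last_guest_tsc) {
                /* TSC cycles the guest has seen since the last update */
                uint64_t delta = c->last_guest_tsc - c->tsc_timestamp;

                /* same math as pvclock_scale_delta() in the earlier sketch */
                if (c->tsc_shift < 0)
                        delta >>= -c->tsc_shift;
                else
                        delta <<= c->tsc_shift;

                /*
                 * Latest system time the guest could already have computed
                 * from the old parameters: old base plus the scaled delta.
                 */
                max_kernel_ns = (int64_t)(uint64_t)
                        (((unsigned __int128)delta *
                          c->tsc_to_system_mul) >> 32) +
                        c->last_kernel_ns;
        }

        /* Never publish a system time older than one the guest saw. */
        return max_kernel_ns > kernel_ns ? max_kernel_ns : kernel_ns;
}
```

In the patch itself, kvm_write_guest_time() publishes the clamped value
plus kvmclock_offset, and vcpu_enter_guest() refreshes last_guest_tsc
just before re-entering the guest, which is what bounds the clamp.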