
[lucid] fix smp kvm guests on AMD

Message ID 20110301162500.GA14959@peq.hallyn.com
State Superseded

Commit Message

Serge Hallyn March 1, 2011, 4:25 p.m. UTC
Hi,

Attached is a patch which fixes bug 714335.  Here is the SRU
justification:

1. impact: KVM SMP guests hang on AMD hosts
2. how the bug was addressed: two upstream commits (plus a third,
auxiliary one) were re-implemented on lucid's kernel.
3. patch: see below
4. TEST CASE: on lucid, on an AMD box, start a guest with '-smp 2'
(a sample invocation follows below).
5. regression potential: Honestly, this one scares me a bit because it
was a tricky cherry-pick, and it affects timekeeping for all KVM users
on x86 lucid.  In theory, timekeeping on guests could be adversely
affected, or guests could fail to boot.  However, guests have been
tested on both AMD and Intel hosts.
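For reference, a minimal invocation along the lines of the test case
(the disk image name is only a placeholder) would be:

  kvm -m 512 -smp 2 -hda lucid.img

Before the fix, a guest started this way on an AMD host would hang;
with the fix it should boot and keep time normally.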

From 2b08698ef4a73fd3fc025e3fa01dc0f524f591c6 Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serge.hallyn@canonical.com>
Date: Mon, 28 Feb 2011 20:48:41 +0000
Subject: [PATCH 1/1] kvm: cherrypick three kvm fixes

1d5f066e0b63271b67eac6d3752f8aa96adcbddb: KVM: x86: Fix a possible backwards warp of kvmclock
and
28e4639adf0c9f26f6bb56149b7ab547bf33bb95: KVM: x86: Fix kvmclock bug
and
347bb4448c2155eb2310923ccaa4be5677649003: x86: pvclock: Move scale_delta into common header

Hopefully this will fix the qemu hangs on AMD systems that involve
-smp X (X >= 2) and timekeeping.  (LP: #714335)

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
---
 arch/x86/include/asm/kvm_host.h |    3 ++
 arch/x86/include/asm/pvclock.h  |   38 ++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   48 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 3 deletions(-)
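
To help review, below is a minimal user-space sketch (not part of the
patch) of the monotonicity clamp these commits add to
kvm_write_guest_time().  The scale_delta stand-in uses gcc's unsigned
__int128 in place of the inline assembly, and all numbers are
hypothetical:

  #include <stdint.h>
  #include <stdio.h>

  /* Stand-in for pvclock_scale_delta(): ((delta << shift) * mul_frac) >> 32 */
  static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
  {
          if (shift < 0)
                  delta >>= -shift;
          else
                  delta <<= shift;
          return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
  }

  int main(void)
  {
          /* state captured at the previous update (hypothetical values) */
          uint64_t last_guest_tsc    = 1001000;    /* TSC the guest last saw  */
          uint64_t tsc_timestamp     = 1000000;    /* base TSC of that update */
          int64_t  last_kernel_ns    = 5000000;
          uint32_t tsc_to_system_mul = 0x80000000; /* 2 GHz TSC: 0.5 ns/cycle */
          int      tsc_shift         = 0;

          /* new sample whose kernel_ns lags what the guest already observed */
          int64_t kernel_ns = 5000200;

          int64_t max_kernel_ns = last_kernel_ns +
                  (int64_t)scale_delta(last_guest_tsc - tsc_timestamp,
                                       tsc_to_system_mul, tsc_shift);
          if (max_kernel_ns > kernel_ns)
                  kernel_ns = max_kernel_ns; /* never hand the guest a time
                                                earlier than one it has seen */

          printf("system_time = %lld ns\n", (long long)kernel_ns);
          return 0;
  }

With these numbers the guest has already observed 5000500 ns
(5000000 + 1000 cycles * 0.5 ns/cycle), so the lagging 5000200 ns
sample is bumped up to 5000500 rather than being written to the guest
clock.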

Comments

Tim Gardner March 2, 2011, 2:04 p.m. UTC | #1
On 03/01/2011 09:25 AM, Serge E. Hallyn wrote:
> [SRU justification and patch quoted in full -- snipped]

Serge - Can we get this as 3 different patches, each with an appropriate 
'backported from' or 'cherry-picked from' SHA1 ID.
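
For reference, the provenance footers would go at the end of each
commit message, e.g. for the first commit:

  (cherry picked from commit 1d5f066e0b63271b67eac6d3752f8aa96adcbddb)

or, where the code had to be adapted rather than applied verbatim:

  (backported from commit 1d5f066e0b63271b67eac6d3752f8aa96adcbddb)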
Serge Hallyn March 2, 2011, 3:31 p.m. UTC | #2
Quoting Tim Gardner (tim.gardner@canonical.com):
> Serge - Can we get this as 3 different patches, each with an
> appropriate 'backported from' or 'cherry-picked from' SHA1 ID.

Sure.  They are actually not really 'independent fixes' IMO,
they just went upstream that way, but if it helps bookkeeping
then no problem.

-serge

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 600807b..08bc2ff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -357,6 +357,9 @@  struct kvm_vcpu_arch {
 	struct page *time_page;
 
 	bool singlestep; /* guest is single stepped by KVM */
+	u64 last_guest_tsc;
+	u64 last_kernel_ns;
+
 	bool nmi_pending;
 	bool nmi_injected;
 
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd..44b9ae7 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,4 +11,42 @@  void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
 
+/*
+ * Scale a 64-bit delta: shift it, then multiply by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2c02a2..a857d05 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -47,6 +47,7 @@ 
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/pvclock.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -633,6 +634,8 @@  static void kvm_write_guest_time(struct kvm_vcpu *v)
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
+	s64 kernel_ns, max_kernel_ns;
+	u64 tsc_timestamp;
 
 	if ((!vcpu->time_page))
 		return;
@@ -646,15 +649,51 @@  static void kvm_write_guest_time(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
 	ktime_get_ts(&ts);
 	monotonic_to_bootbased(&ts);
 	local_irq_restore(flags);
+	kernel_ns = timespec_to_ns(&ts);
+
+	/*
+	 * Time as measured by the TSC may go backwards when resetting the base
+	 * tsc_timestamp.  The reason for this is that the TSC resolution is
+	 * higher than the resolution of the other clock scales.  Thus, many
+	 * possible measurements of the TSC correspond to one measurement of any
+	 * other clock, and so a spread of values is possible.  This is not a
+	 * problem for the computation of the nanosecond clock; with TSC rates
+	 * around 1GHz, there can only be a few cycles which correspond to one
+	 * nanosecond value, and any path through this code will inevitably
+	 * take longer than that.  However, with the kernel_ns value itself,
+	 * the precision may be much lower, down to HZ granularity.  If the
+	 * first sampling of TSC against kernel_ns ends in the low part of the
+	 * range, and the second in the high end of the range, we can get:
+	 *
+	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+	 *
+	 * As the sampling errors potentially range in the thousands of cycles,
+	 * it is possible such a time value has already been observed by the
+	 * guest.  To protect against this, we must compute the system time as
+	 * observed by the guest and ensure the new system time is greater.
+	 */
+	max_kernel_ns = 0;
+	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+		max_kernel_ns = vcpu->last_guest_tsc -
+			vcpu->hv_clock.tsc_timestamp;
+		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+				vcpu->hv_clock.tsc_to_system_mul,
+				vcpu->hv_clock.tsc_shift);
+		max_kernel_ns += vcpu->last_kernel_ns;
+	}
+
+	if (max_kernel_ns > kernel_ns)
+		kernel_ns = max_kernel_ns;
 
 	/* With all the info we got, fill in the values */
 
-	vcpu->hv_clock.system_time = ts.tv_nsec +
-				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+	vcpu->last_kernel_ns = kernel_ns;
+	vcpu->last_guest_tsc = tsc_timestamp;
 
 	/*
 	 * The interface expects us to write an even number signaling that the
@@ -3695,6 +3735,8 @@  static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	kvm_load_guest_fpu(vcpu);
 
+	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
 	local_irq_disable();
 
 	clear_bit(KVM_REQ_KICK, &vcpu->requests);
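
As a quick sanity check on the pvclock_scale_delta() backport, the
stand-alone harness below (an illustration, assuming an x86-64 host and
gcc) compares the inline-asm path against a full 128-bit multiply.
Note the harness declares %rdx as an output, since mul writes it:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  typedef uint64_t u64;
  typedef uint32_t u32;

  static u64 scale_delta_asm(u64 delta, u32 mul_frac, int shift)
  {
          u64 product, hi;

          if (shift < 0)
                  delta >>= -shift;
          else
                  delta <<= shift;

          /* rdx:rax = delta * mul_frac; shrd keeps bits 32..95 in rax */
          __asm__ ("mul %%rdx ; shrd $32,%%rdx,%%rax"
                   : "=a" (product), "=d" (hi)
                   : "0" (delta), "1" ((u64)mul_frac));
          return product;
  }

  static u64 scale_delta_ref(u64 delta, u32 mul_frac, int shift)
  {
          if (shift < 0)
                  delta >>= -shift;
          else
                  delta <<= shift;
          return (u64)(((unsigned __int128)delta * mul_frac) >> 32);
  }

  int main(void)
  {
          u64 deltas[] = { 0, 1, 12345, 0xdeadbeefULL, 1ULL << 40 };
          u32 muls[]   = { 1, 0x80000000u, 0xffffffffu };
          int shifts[] = { -2, 0, 3 };
          unsigned i, j, k;

          for (i = 0; i < 5; i++)
                  for (j = 0; j < 3; j++)
                          for (k = 0; k < 3; k++)
                                  assert(scale_delta_asm(deltas[i], muls[j], shifts[k]) ==
                                         scale_delta_ref(deltas[i], muls[j], shifts[k]));
          puts("asm and 128-bit reference agree");
          return 0;
  }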