diff mbox

[1/2] KVM: emulate lapic tsc deadline timer for guest

Message ID BC00F5384FCFC9499AF06F92E8B78A9E254455B6C3@shsmsx502.ccr.corp.intel.com
State New
Headers show

Commit Message

Liu, Jinsong Sept. 13, 2011, 2:36 p.m. UTC
From 7b12021e1d1b79797b49e41cc0a7be05a6180d9a Mon Sep 17 00:00:00 2001
From: Liu, Jinsong <jinsong.liu@intel.com>
Date: Tue, 13 Sep 2011 21:52:54 +0800
Subject: [PATCH] KVM: emulate lapic tsc deadline timer for guest

This patch emulates the lapic tsc deadline timer for the guest:
Enumerate tsc deadline timer capability by CPUID;
Enable tsc deadline timer mode by lapic MMIO;
Start tsc deadline timer by WRMSR;

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
---
 arch/x86/include/asm/apicdef.h    |    2 +
 arch/x86/include/asm/cpufeature.h |    3 +
 arch/x86/include/asm/kvm_host.h   |    2 +
 arch/x86/include/asm/msr-index.h  |    2 +
 arch/x86/kvm/kvm_timer.h          |    2 +
 arch/x86/kvm/lapic.c              |  122 ++++++++++++++++++++++++++++++-------
 arch/x86/kvm/lapic.h              |    3 +
 arch/x86/kvm/x86.c                |   20 ++++++-
 8 files changed, 132 insertions(+), 24 deletions(-)

Comments

Marcelo Tosatti Sept. 14, 2011, 11:45 a.m. UTC | #1
On Tue, Sep 13, 2011 at 10:36:51PM +0800, Liu, Jinsong wrote:
> >From 7b12021e1d1b79797b49e41cc0a7be05a6180d9a Mon Sep 17 00:00:00 2001
> From: Liu, Jinsong <jinsong.liu@intel.com>
> Date: Tue, 13 Sep 2011 21:52:54 +0800
> Subject: [PATCH] KVM: emulate lapic tsc deadline timer for guest
> 
> This patch emulate lapic tsc deadline timer for guest:
> Enumerate tsc deadline timer capability by CPUID;
> Enable tsc deadline timer mode by lapic MMIO;
> Start tsc deadline timer by WRMSR;
> 
> Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
> ---
>  arch/x86/include/asm/apicdef.h    |    2 +
>  arch/x86/include/asm/cpufeature.h |    3 +
>  arch/x86/include/asm/kvm_host.h   |    2 +
>  arch/x86/include/asm/msr-index.h  |    2 +
>  arch/x86/kvm/kvm_timer.h          |    2 +
>  arch/x86/kvm/lapic.c              |  122 ++++++++++++++++++++++++++++++-------
>  arch/x86/kvm/lapic.h              |    3 +
>  arch/x86/kvm/x86.c                |   20 ++++++-
>  8 files changed, 132 insertions(+), 24 deletions(-)
> 
> diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
> index 34595d5..3925d80 100644
> --- a/arch/x86/include/asm/apicdef.h
> +++ b/arch/x86/include/asm/apicdef.h
> @@ -100,7 +100,9 @@
>  #define		APIC_TIMER_BASE_CLKIN		0x0
>  #define		APIC_TIMER_BASE_TMBASE		0x1
>  #define		APIC_TIMER_BASE_DIV		0x2
> +#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
>  #define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
> +#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
>  #define		APIC_LVT_MASKED			(1 << 16)
>  #define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
>  #define		APIC_LVT_REMOTE_IRR		(1 << 14)

Please have a separate, introductory patch for definitions that are not 
KVM specific.

> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -671,6 +671,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
>  
>  extern bool tdp_enabled;
>  
> +extern u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
> +

No need for extern.

> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 2b2255b..925d4b9 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -135,9 +135,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
>  	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
>  }
>  
> +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
> +{
> +	return ((apic_get_reg(apic, APIC_LVTT) & 
> +		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
> +}
> +
>  static inline int apic_lvtt_period(struct kvm_lapic *apic)
>  {
> -	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
> +	return ((apic_get_reg(apic, APIC_LVTT) & 
> +		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
> +}
> +
> +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
> +{
> +	return ((apic_get_reg(apic, APIC_LVTT) & 
> +		apic->lapic_timer.timer_mode_mask) == 
> +			APIC_LVT_TIMER_TSCDEADLINE);
>  }
>  
>  static inline int apic_lvt_nmi_mode(u32 lvt_val)
> @@ -166,7 +180,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
>  }
>  
>  static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
> -	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
> +	LVT_MASK ,	/* part LVTT mask, timer mode mask added at runtime */
>  	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
>  	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
>  	LINT_MASK, LINT_MASK,	/* LVT0-1 */
> @@ -570,6 +584,9 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
>  		break;
>  
>  	case APIC_TMCCT:	/* Timer CCR */
> +		if (apic_lvtt_tscdeadline(apic))
> +			return 0;
> +
>  		val = apic_get_tmcct(apic);
>  		break;
>  
> @@ -664,29 +681,32 @@ static void update_divide_count(struct kvm_lapic *apic)
>  
>  static void start_apic_timer(struct kvm_lapic *apic)
>  {
> -	ktime_t now = apic->lapic_timer.timer.base->get_time();
> -
> -	apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
> -		    APIC_BUS_CYCLE_NS * apic->divide_count;
> +	ktime_t now;
>  	atomic_set(&apic->lapic_timer.pending, 0);
>  
> -	if (!apic->lapic_timer.period)
> -		return;
> -	/*
> -	 * Do not allow the guest to program periodic timers with small
> -	 * interval, since the hrtimers are not throttled by the host
> -	 * scheduler.
> -	 */
> -	if (apic_lvtt_period(apic)) {
> -		if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
> -			apic->lapic_timer.period = NSEC_PER_MSEC/2;
> -	}
> +	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
> +		/* lapic timer in oneshot or peroidic mode */
> +		now = apic->lapic_timer.timer.base->get_time();
> +		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
> +			    * APIC_BUS_CYCLE_NS * apic->divide_count;
> +
> +		if (!apic->lapic_timer.period)
> +			return;
> +		/*
> +		 * Do not allow the guest to program periodic timers with small
> +		 * interval, since the hrtimers are not throttled by the host
> +		 * scheduler.
> +		 */
> +		if (apic_lvtt_period(apic)) {
> +			if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
> +				apic->lapic_timer.period = NSEC_PER_MSEC/2;
> +		}
>  
> -	hrtimer_start(&apic->lapic_timer.timer,
> -		      ktime_add_ns(now, apic->lapic_timer.period),
> -		      HRTIMER_MODE_ABS);
> +		hrtimer_start(&apic->lapic_timer.timer,
> +			      ktime_add_ns(now, apic->lapic_timer.period),
> +			      HRTIMER_MODE_ABS);
>  
> -	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
> +		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
>  			   PRIx64 ", "
>  			   "timer initial count 0x%x, period %lldns, "
>  			   "expire @ 0x%016" PRIx64 ".\n", __func__,
> @@ -695,6 +715,28 @@ static void start_apic_timer(struct kvm_lapic *apic)
>  			   apic->lapic_timer.period,
>  			   ktime_to_ns(ktime_add_ns(now,
>  					apic->lapic_timer.period)));
> +	} else if (apic_lvtt_tscdeadline(apic)) {
> +		/* lapic timer in tsc deadline mode */
> +		u64 guest_tsc, guest_tsc_delta, ns = 0;
> +		struct kvm_vcpu *vcpu = apic->vcpu;
> +		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
> +		unsigned long flags;
> +
> +		if (unlikely(!apic->lapic_timer.tscdeadline || !this_tsc_khz))
> +			return;
> +
> +		local_irq_save(flags);
> +
> +		now = apic->lapic_timer.timer.base->get_time();
> +		kvm_get_msr(vcpu, MSR_IA32_TSC, &guest_tsc);

Use kvm_x86_ops->read_l1_tsc(vcpu) instead of direct MSR read
(to avoid reading L2 guest TSC in case of nested virt).

> +		guest_tsc_delta = apic->lapic_timer.tscdeadline - guest_tsc;

If (guest_tsc >= tscdeadline), i.e. the deadline has already passed, the timer should fire immediately.

> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6cb353c..a73c059 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -610,6 +610,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
>  		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
>  			best->ecx |= bit(X86_FEATURE_OSXSAVE);
>  	}
> +
> +	/* 
> +	 * When cpu has tsc deadline timer capacibility, use bit 17/18 
> +	 * as timer mode mask. Otherwise only use bit 17.
> +	 */
> +	if (cpu_has_tsc_deadline_timer && best->function == 0x1) {
> +		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
> +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3 << 17);
> +	} else
> +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (1 << 17);
>  }

The deadline timer is entirely emulated; whether the host CPU supports
it or not is irrelevant.

Why was this changed from previous submissions?
Liu, Jinsong Sept. 15, 2011, 6:22 a.m. UTC | #2
Marcelo Tosatti wrote:
>> diff --git a/arch/x86/include/asm/apicdef.h
>> b/arch/x86/include/asm/apicdef.h 
>> index 34595d5..3925d80 100644
>> --- a/arch/x86/include/asm/apicdef.h
>> +++ b/arch/x86/include/asm/apicdef.h
>> @@ -100,7 +100,9 @@
>>  #define		APIC_TIMER_BASE_CLKIN		0x0
>>  #define		APIC_TIMER_BASE_TMBASE		0x1
>>  #define		APIC_TIMER_BASE_DIV		0x2
>> +#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
>>  #define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
>> +#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
>>  #define		APIC_LVT_MASKED			(1 << 16)
>>  #define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
>>  #define		APIC_LVT_REMOTE_IRR		(1 << 14)
> 
> Please have a separate, introductory patch for definitions that are
> not KVM specific.
> 

OK, I will present a separate patch. BTW, should the separate patch still be sent to kvm@vger.kernel.org?

>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -671,6 +671,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu
>> *vcpu, gfn_t gfn); 
>> 
>>  extern bool tdp_enabled;
>> 
>> +extern u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
>> +
> 
> No need for extern.
> 

Is there any special concern, or is it just a matter of coding style? I'm a little curious :)

>> +	} else if (apic_lvtt_tscdeadline(apic)) {
>> +		/* lapic timer in tsc deadline mode */
>> +		u64 guest_tsc, guest_tsc_delta, ns = 0;
>> +		struct kvm_vcpu *vcpu = apic->vcpu;
>> +		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); +		unsigned long
>> flags; +
>> +		if (unlikely(!apic->lapic_timer.tscdeadline || !this_tsc_khz))
>> +			return; +
>> +		local_irq_save(flags);
>> +
>> +		now = apic->lapic_timer.timer.base->get_time();
>> +		kvm_get_msr(vcpu, MSR_IA32_TSC, &guest_tsc);
> 
> Use kvm_x86_ops->read_l1_tsc(vcpu) instead of direct MSR read
> (to avoid reading L2 guest TSC in case of nested virt).
> 

Fine. I was using an old version of kvm (Jul 22) and didn't notice Nadav's patch, checked in on Aug 2, which adds the read_l1_tsc hook.
Thanks for telling me.

>> +		guest_tsc_delta = apic->lapic_timer.tscdeadline - guest_tsc;
> 
> if (guest_tsc <= tscdeadline), the timer should start immediately.
> 
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 6cb353c..a73c059 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -610,6 +610,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
>>  		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
>>  			best->ecx |= bit(X86_FEATURE_OSXSAVE);
>>  	}
>> +
>> +	/*
>> +	 * When cpu has tsc deadline timer capacibility, use bit 17/18
>> +	 * as timer mode mask. Otherwise only use bit 17. +	 */
>> +	if (cpu_has_tsc_deadline_timer && best->function == 0x1) {
>> +		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
>> +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3 << 17); +	} else
>> +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (1 << 17);
>>  }
> 
> The deadline timer is entirely emulated, whether the host CPU supports
> it or not is irrelevant.
> 
> Why was this changed from previous submissions?

Hmm, will explain in next email.

Thanks,
Jinsong
Liu, Jinsong Sept. 15, 2011, 8:17 a.m. UTC | #3
Marcelo Tosatti wrote:
>> +	} else if (apic_lvtt_tscdeadline(apic)) {
>> +		/* lapic timer in tsc deadline mode */
>> +		u64 guest_tsc, guest_tsc_delta, ns = 0;
>> +		struct kvm_vcpu *vcpu = apic->vcpu;
>> +		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); +		unsigned long
>> flags; +
>> +		if (unlikely(!apic->lapic_timer.tscdeadline || !this_tsc_khz))
>> +			return; +
>> +		local_irq_save(flags);
>> +
>> +		now = apic->lapic_timer.timer.base->get_time();
>> +		kvm_get_msr(vcpu, MSR_IA32_TSC, &guest_tsc);
> 
> Use kvm_x86_ops->read_l1_tsc(vcpu) instead of direct MSR read
> (to avoid reading L2 guest TSC in case of nested virt).
> 
>> +		guest_tsc_delta = apic->lapic_timer.tscdeadline - guest_tsc;
> 
> if (guest_tsc <= tscdeadline), the timer should start immediately.
> 

Yes, in that case the timer does start immediately, with ns = 0.

Thanks,
Jinsong
Marcelo Tosatti Sept. 15, 2011, 11:45 a.m. UTC | #4
On Thu, Sep 15, 2011 at 04:17:20PM +0800, Liu, Jinsong wrote:
> Marcelo Tosatti wrote:
> >> +	} else if (apic_lvtt_tscdeadline(apic)) {
> >> +		/* lapic timer in tsc deadline mode */
> >> +		u64 guest_tsc, guest_tsc_delta, ns = 0;
> >> +		struct kvm_vcpu *vcpu = apic->vcpu;
> >> +		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); +		unsigned long
> >> flags; +
> >> +		if (unlikely(!apic->lapic_timer.tscdeadline || !this_tsc_khz))
> >> +			return; +
> >> +		local_irq_save(flags);
> >> +
> >> +		now = apic->lapic_timer.timer.base->get_time();
> >> +		kvm_get_msr(vcpu, MSR_IA32_TSC, &guest_tsc);
> > 
> > Use kvm_x86_ops->read_l1_tsc(vcpu) instead of direct MSR read
> > (to avoid reading L2 guest TSC in case of nested virt).
> > 
> >> +		guest_tsc_delta = apic->lapic_timer.tscdeadline - guest_tsc;
> > 
> > if (guest_tsc <= tscdeadline), the timer should start immediately.
> > 
> 
> Yes, under such case the timer does start immediately, with ns = 0

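No, guest_tsc_delta is unsigned, so it can never be negative: when the deadline has already passed, the subtraction wraps around to a huge positive value, and the "> 0" comparison does not catch that case.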
Marcelo Tosatti Sept. 15, 2011, 12:15 p.m. UTC | #5
On Thu, Sep 15, 2011 at 02:22:58PM +0800, Liu, Jinsong wrote:
> Marcelo Tosatti wrote:
> >> diff --git a/arch/x86/include/asm/apicdef.h
> >> b/arch/x86/include/asm/apicdef.h 
> >> index 34595d5..3925d80 100644
> >> --- a/arch/x86/include/asm/apicdef.h
> >> +++ b/arch/x86/include/asm/apicdef.h
> >> @@ -100,7 +100,9 @@
> >>  #define		APIC_TIMER_BASE_CLKIN		0x0
> >>  #define		APIC_TIMER_BASE_TMBASE		0x1
> >>  #define		APIC_TIMER_BASE_DIV		0x2
> >> +#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
> >>  #define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
> >> +#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
> >>  #define		APIC_LVT_MASKED			(1 << 16)
> >>  #define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
> >>  #define		APIC_LVT_REMOTE_IRR		(1 << 14)
> > 
> > Please have a separate, introductory patch for definitions that are
> > not KVM specific.
> > 
> 
> OK, will present a separate patch. BTW, will the separate patch still be send to kvm@vger.kernel.org?

Yes.

> 
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -671,6 +671,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu
> >> *vcpu, gfn_t gfn); 
> >> 
> >>  extern bool tdp_enabled;
> >> 
> >> +extern u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
> >> +
> > 
> > No need for extern.
> > 
> 
> Any special concern, or, for coding style? a little curious :)

It is not necessary.
diff mbox

Patch

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 34595d5..3925d80 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -100,7 +100,9 @@ 
 #define		APIC_TIMER_BASE_CLKIN		0x0
 #define		APIC_TIMER_BASE_TMBASE		0x1
 #define		APIC_TIMER_BASE_DIV		0x2
+#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
 #define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
+#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
 #define		APIC_LVT_MASKED			(1 << 16)
 #define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
 #define		APIC_LVT_REMOTE_IRR		(1 << 14)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4258aac..8a26e48 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -120,6 +120,7 @@ 
 #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
 #define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
 #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER    (4*32+24) /* Tsc deadline timer */
 #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
@@ -284,6 +285,8 @@  extern const char * const x86_power_flags[32];
 #define cpu_has_xmm4_1		boot_cpu_has(X86_FEATURE_XMM4_1)
 #define cpu_has_xmm4_2		boot_cpu_has(X86_FEATURE_XMM4_2)
 #define cpu_has_x2apic		boot_cpu_has(X86_FEATURE_X2APIC)
+#define cpu_has_tsc_deadline_timer	\
+				boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)
 #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE)
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 307e3cf..2ce6529 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -671,6 +671,8 @@  u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
 
+extern u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
+
 /* control of guest tsc rate supported? */
 extern bool kvm_has_tsc_control;
 /* minimum supported tsc_khz for guests */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d52609a..a6962d9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -229,6 +229,8 @@ 
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
+#define MSR_IA32_TSCDEADLINE		0x000006e0
+
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 64bc6ea..497dbaa 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -2,6 +2,8 @@ 
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
+	u32 timer_mode_mask;
+	u64 tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool reinject;
 	struct kvm_timer_ops *t_ops;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2b2255b..925d4b9 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -135,9 +135,23 @@  static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
 	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
 }
 
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+	return ((apic_get_reg(apic, APIC_LVTT) & 
+		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+}
+
 static inline int apic_lvtt_period(struct kvm_lapic *apic)
 {
-	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+	return ((apic_get_reg(apic, APIC_LVTT) & 
+		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+	return ((apic_get_reg(apic, APIC_LVTT) & 
+		apic->lapic_timer.timer_mode_mask) == 
+			APIC_LVT_TIMER_TSCDEADLINE);
 }
 
 static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -166,7 +180,7 @@  static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 }
 
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
-	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
+	LVT_MASK ,	/* part LVTT mask, timer mode mask added at runtime */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
 	LINT_MASK, LINT_MASK,	/* LVT0-1 */
@@ -570,6 +584,9 @@  static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		break;
 
 	case APIC_TMCCT:	/* Timer CCR */
+		if (apic_lvtt_tscdeadline(apic))
+			return 0;
+
 		val = apic_get_tmcct(apic);
 		break;
 
@@ -664,29 +681,32 @@  static void update_divide_count(struct kvm_lapic *apic)
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
-	ktime_t now = apic->lapic_timer.timer.base->get_time();
-
-	apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
-		    APIC_BUS_CYCLE_NS * apic->divide_count;
+	ktime_t now;
 	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (!apic->lapic_timer.period)
-		return;
-	/*
-	 * Do not allow the guest to program periodic timers with small
-	 * interval, since the hrtimers are not throttled by the host
-	 * scheduler.
-	 */
-	if (apic_lvtt_period(apic)) {
-		if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
-			apic->lapic_timer.period = NSEC_PER_MSEC/2;
-	}
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+		/* lapic timer in oneshot or periodic mode */
+		now = apic->lapic_timer.timer.base->get_time();
+		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+			    * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+		if (!apic->lapic_timer.period)
+			return;
+		/*
+		 * Do not allow the guest to program periodic timers with small
+		 * interval, since the hrtimers are not throttled by the host
+		 * scheduler.
+		 */
+		if (apic_lvtt_period(apic)) {
+			if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+				apic->lapic_timer.period = NSEC_PER_MSEC/2;
+		}
 
-	hrtimer_start(&apic->lapic_timer.timer,
-		      ktime_add_ns(now, apic->lapic_timer.period),
-		      HRTIMER_MODE_ABS);
+		hrtimer_start(&apic->lapic_timer.timer,
+			      ktime_add_ns(now, apic->lapic_timer.period),
+			      HRTIMER_MODE_ABS);
 
-	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 			   PRIx64 ", "
 			   "timer initial count 0x%x, period %lldns, "
 			   "expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +715,28 @@  static void start_apic_timer(struct kvm_lapic *apic)
 			   apic->lapic_timer.period,
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
+	} else if (apic_lvtt_tscdeadline(apic)) {
+		/* lapic timer in tsc deadline mode */
+		u64 guest_tsc, guest_tsc_delta, ns = 0;
+		struct kvm_vcpu *vcpu = apic->vcpu;
+		unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+		unsigned long flags;
+
+		if (unlikely(!apic->lapic_timer.tscdeadline || !this_tsc_khz))
+			return;
+
+		local_irq_save(flags);
+
+		now = apic->lapic_timer.timer.base->get_time();
+		kvm_get_msr(vcpu, MSR_IA32_TSC, &guest_tsc);
+		guest_tsc_delta = apic->lapic_timer.tscdeadline - guest_tsc;
+		if (likely(guest_tsc_delta > 0))
+			ns = guest_tsc_delta * 1000000UL / this_tsc_khz;
+		hrtimer_start(&apic->lapic_timer.timer,
+			ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+
+		local_irq_restore(flags);
+	}
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -782,7 +824,6 @@  static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 	case APIC_LVT0:
 		apic_manage_nmi_watchdog(apic, val);
-	case APIC_LVTT:
 	case APIC_LVTTHMR:
 	case APIC_LVTPC:
 	case APIC_LVT1:
@@ -796,7 +837,22 @@  static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 		break;
 
+	case APIC_LVTT:
+		if ((apic_get_reg(apic, APIC_LVTT) & 
+		     apic->lapic_timer.timer_mode_mask) !=
+		    (val & apic->lapic_timer.timer_mode_mask))
+			hrtimer_cancel(&apic->lapic_timer.timer);
+
+		if (!apic_sw_enabled(apic))
+			val |= APIC_LVT_MASKED;
+		val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+		apic_set_reg(apic, APIC_LVTT, val);
+		break;
+
 	case APIC_TMICT:
+		if (apic_lvtt_tscdeadline(apic))
+			break;
+
 		hrtimer_cancel(&apic->lapic_timer.timer);
 		apic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
@@ -883,6 +939,28 @@  void kvm_free_lapic(struct kvm_vcpu *vcpu)
  *----------------------------------------------------------------------
  */
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+		return 0;
+
+	return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+		return;
+
+	hrtimer_cancel(&apic->lapic_timer.timer);
+	apic->lapic_timer.tscdeadline = data;
+	start_apic_timer(apic);
+}
+
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 52c9e6b..10c6ee6 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -41,6 +41,9 @@  int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6cb353c..a73c059 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -610,6 +610,16 @@  static void update_cpuid(struct kvm_vcpu *vcpu)
 		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
 			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
+
+	/* 
+	 * When cpu has tsc deadline timer capability, use bit 17/18 
+	 * as timer mode mask. Otherwise only use bit 17.
+	 */
+	if (cpu_has_tsc_deadline_timer && best->function == 0x1) {
+		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
+		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3 << 17);
+	} else
+		vcpu->arch.apic->lapic_timer.timer_mode_mask = (1 << 17);
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -819,7 +829,7 @@  static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+	MSR_IA32_TSC, MSR_IA32_TSCDEADLINE, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 };
 
 static unsigned num_msrs_to_save;
@@ -1000,7 +1010,7 @@  static inline int kvm_tsc_changes_freq(void)
 	return ret;
 }
 
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.virtual_tsc_khz)
 		return vcpu->arch.virtual_tsc_khz;
@@ -1564,6 +1574,9 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
+	case MSR_IA32_TSCDEADLINE:
+		kvm_set_lapic_tscdeadline_msr(vcpu, data);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
@@ -1891,6 +1904,9 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_read(vcpu, msr, pdata);
 		break;
+	case MSR_IA32_TSCDEADLINE:
+		data = kvm_get_lapic_tscdeadline_msr(vcpu);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;