Patchwork KVM: emulate lapic tsc deadline timer for hvm

login
register
mail settings
Submitter Liu, Jinsong
Date Aug. 17, 2011, 4:19 a.m.
Message ID <BC00F5384FCFC9499AF06F92E8B78A9E24888339F4@shsmsx502.ccr.corp.intel.com>
Download mbox | patch
Permalink /patch/110275/
State New
Headers show

Comments

Liu, Jinsong - Aug. 17, 2011, 4:19 a.m.
From a9670ddff84080c56183e2d678189e100f891174 Mon Sep 17 00:00:00 2001
From: Liu, Jinsong <jinsong.liu@intel.com>
Date: Wed, 17 Aug 2011 11:36:28 +0800
Subject: [PATCH] KVM: emulate lapic tsc deadline timer for hvm

This patch emulates the lapic tsc deadline timer for hvm:
Enumerate tsc deadline timer capability by CPUID;
Enable tsc deadline timer mode by LAPIC MMIO;
Start tsc deadline timer by MSR;

Signed-off-by: Liu, Jinsong <jinsong.liu@intel.com>
---
 arch/x86/include/asm/apicdef.h    |    2 +
 arch/x86/include/asm/cpufeature.h |    1 +
 arch/x86/include/asm/kvm_host.h   |    1 +
 arch/x86/include/asm/msr-index.h  |    2 +
 arch/x86/kvm/kvm_timer.h          |    2 +
 arch/x86/kvm/lapic.c              |  119 ++++++++++++++++++++++++++++++-------
 arch/x86/kvm/lapic.h              |    3 +
 arch/x86/kvm/vmx.c                |   19 ++++++
 arch/x86/kvm/x86.c                |    6 ++
 9 files changed, 134 insertions(+), 21 deletions(-)
Avi Kivity - Aug. 22, 2011, 9:18 a.m.
On 08/17/2011 07:19 AM, Liu, Jinsong wrote:
>  From a9670ddff84080c56183e2d678189e100f891174 Mon Sep 17 00:00:00 2001
> From: Liu, Jinsong<jinsong.liu@intel.com>
> Date: Wed, 17 Aug 2011 11:36:28 +0800
> Subject: [PATCH] KVM: emulate lapic tsc deadline timer for hvm

kvm doesn't have hvm.

> This patch emulate lapic tsc deadline timer for hvm:
> Enumerate tsc deadline timer capacibility by CPUID;
> Enable tsc deadline timer mode by LAPIC MMIO;
> Start tsc deadline timer by MSR;

> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> index 4258aac..28bcf48 100644
> --- a/arch/x86/include/asm/cpufeature.h
> +++ b/arch/x86/include/asm/cpufeature.h
> @@ -120,6 +120,7 @@
>   #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
>   #define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
>   #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
> +#define X86_FEATURE_TSC_DEADLINE_TIMER    (4*32+24) /* Tsc deadline timer */
>   #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
>   #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
>   #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 307e3cf..28f7128 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -635,6 +635,7 @@ struct kvm_x86_ops {
>   	int (*check_intercept)(struct kvm_vcpu *vcpu,
>   			       struct x86_instruction_info *info,
>   			       enum x86_intercept_stage stage);
> +	u64 (*guest_to_host_tsc)(u64 guest_tsc);
>   };

Please put this near the other tsc functions.  Add a comment explaining 
what value is returned under nested virtualization.

Please add the svm callback implementation.

>
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -229,6 +229,8 @@
>   #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>   #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>
> +#define MSR_IA32_TSCDEADLINE		0x000006e0
> +
>   #define MSR_IA32_UCODE_WRITE		0x00000079
>   #define MSR_IA32_UCODE_REV		0x0000008b
>   

Need to add to msrs_to_save so live migration works.

>
> @@ -665,28 +682,30 @@ static void update_divide_count(struct kvm_lapic *apic)
>   static void start_apic_timer(struct kvm_lapic *apic)
>   {
>   	ktime_t now = apic->lapic_timer.timer.base->get_time();
> -
> -	apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
> -		    APIC_BUS_CYCLE_NS * apic->divide_count;
>   	atomic_set(&apic->lapic_timer.pending, 0);
>
> -	if (!apic->lapic_timer.period)
> -		return;
> -	/*
> -	 * Do not allow the guest to program periodic timers with small
> -	 * interval, since the hrtimers are not throttled by the host
> -	 * scheduler.
> -	 */
> -	if (apic_lvtt_period(apic)) {
> -		if (apic->lapic_timer.period<  NSEC_PER_MSEC/2)
> -			apic->lapic_timer.period = NSEC_PER_MSEC/2;
> -	}
> +	/* lapic timer in oneshot or peroidic mode */
> +	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
> +		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
> +			    * APIC_BUS_CYCLE_NS * apic->divide_count;
>
> -	hrtimer_start(&apic->lapic_timer.timer,
> -		      ktime_add_ns(now, apic->lapic_timer.period),
> -		      HRTIMER_MODE_ABS);
> +		if (!apic->lapic_timer.period)
> +			return;
> +		/*
> +		 * Do not allow the guest to program periodic timers with small
> +		 * interval, since the hrtimers are not throttled by the host
> +		 * scheduler.
> +		 */
> +		if (apic_lvtt_period(apic)) {
> +			if (apic->lapic_timer.period<  NSEC_PER_MSEC/2)
> +				apic->lapic_timer.period = NSEC_PER_MSEC/2;
> +		}
>
> -	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
> +		hrtimer_start(&apic->lapic_timer.timer,
> +			      ktime_add_ns(now, apic->lapic_timer.period),
> +			      HRTIMER_MODE_ABS);
> +
> +		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
>   			   PRIx64 ", "
>   			   "timer initial count 0x%x, period %lldns, "
>   			   "expire @ 0x%016" PRIx64 ".\n", __func__,
> @@ -695,6 +714,26 @@ static void start_apic_timer(struct kvm_lapic *apic)
>   			   apic->lapic_timer.period,
>   			   ktime_to_ns(ktime_add_ns(now,
>   					apic->lapic_timer.period)));
> +	}
> +
> +	/* lapic timer in tsc deadline mode */
> +	if (apic_lvtt_tscdeadline(apic)) {

'else if' is slightly better, since it shows the reader the options are 
mutually exclusive.

> +		u64 tsc_now, tsc_target, tsc_delta, nsec;
> +
> +		if (!apic->lapic_timer.tscdeadline)
> +			return;
> +
> +		tsc_target = kvm_x86_ops->
> +			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
> +		rdtscll(tsc_now);
> +		tsc_delta = tsc_target - tsc_now;

This only works if we have a constant tsc, that's not true for large 
multiboard machines.  Need to do this with irqs disabled as well 
(reading both 'now' and 'tsc_now' in the same critical section).

> +		if (tsc_delta<  0)
> +			tsc_delta = 0;
> +
> +		nsec = tsc_delta * 1000000L / tsc_khz;
> +		hrtimer_start(&apic->lapic_timer.timer,
> +			ktime_add_ns(now, nsec), HRTIMER_MODE_ABS);
> +	}
>   }
>
> @@ -883,6 +936,28 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
>    *----------------------------------------------------------------------
>    */
>
> +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
> +		return 0;

Why?

> +
> +	return apic->lapic_timer.tscdeadline;
> +}
> +
> +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
> +		return;
> +
> +	hrtimer_cancel(&apic->lapic_timer.timer);
> +	apic->lapic_timer.tscdeadline = data;
> +	start_apic_timer(apic);

Shouldn't the msr value be updated even if we're outside tsc-deadline mode?

> +}
> +


>   /*
>    * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
>    * ioctl. In this case the call-back should update internal vmx state to make
> @@ -6270,6 +6278,16 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
>   			}
>   		}
>   	}
> +
> +	/*
> +	 * Emulate Intel lapic tsc deadline timer even if host not support it.
> +	 * Open CPUID.1.ECX[24] and use bit17/18 as timer mode mask.
> +	 */
> +	best = kvm_find_cpuid_entry(vcpu, 1, 0);
> +	if (best) {
> +		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
> +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3<<  17);
> +	}
>   }

Should be in common code; there is nothing vmx specific about it 
(although it is Intel specific at present).
Marcelo Tosatti - Aug. 23, 2011, 10:47 a.m.
On Mon, Aug 22, 2011 at 12:18:49PM +0300, Avi Kivity wrote:
> On 08/17/2011 07:19 AM, Liu, Jinsong wrote:
> > From a9670ddff84080c56183e2d678189e100f891174 Mon Sep 17 00:00:00 2001
> >From: Liu, Jinsong<jinsong.liu@intel.com>
> >Date: Wed, 17 Aug 2011 11:36:28 +0800
> >Subject: [PATCH] KVM: emulate lapic tsc deadline timer for hvm
> 
> kvm doesn't have hvm.
> 
> >This patch emulate lapic tsc deadline timer for hvm:
> >Enumerate tsc deadline timer capacibility by CPUID;
> >Enable tsc deadline timer mode by LAPIC MMIO;
> >Start tsc deadline timer by MSR;
> 
> >diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
> >index 4258aac..28bcf48 100644
> >--- a/arch/x86/include/asm/cpufeature.h
> >+++ b/arch/x86/include/asm/cpufeature.h
> >@@ -120,6 +120,7 @@
> >  #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
> >  #define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
> >  #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
> >+#define X86_FEATURE_TSC_DEADLINE_TIMER    (4*32+24) /* Tsc deadline timer */
> >  #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
> >  #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
> >  #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
> >diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >index 307e3cf..28f7128 100644
> >--- a/arch/x86/include/asm/kvm_host.h
> >+++ b/arch/x86/include/asm/kvm_host.h
> >@@ -635,6 +635,7 @@ struct kvm_x86_ops {
> >  	int (*check_intercept)(struct kvm_vcpu *vcpu,
> >  			       struct x86_instruction_info *info,
> >  			       enum x86_intercept_stage stage);
> >+	u64 (*guest_to_host_tsc)(u64 guest_tsc);
> >  };
> 
> Please put this near the other tsc functions.  Add a comment
> explaining what value is returned under nested virtualization.
> 
> Please add the svm callback implementation.
> 
> >
> >--- a/arch/x86/include/asm/msr-index.h
> >+++ b/arch/x86/include/asm/msr-index.h
> >@@ -229,6 +229,8 @@
> >  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
> >  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
> >
> >+#define MSR_IA32_TSCDEADLINE		0x000006e0
> >+
> >  #define MSR_IA32_UCODE_WRITE		0x00000079
> >  #define MSR_IA32_UCODE_REV		0x0000008b
> 
> Need to add to msrs_to_save so live migration works.

MSR must be explicitly listed in qemu, also.

> >+		if (!apic->lapic_timer.tscdeadline)
> >+			return;
> >+
> >+		tsc_target = kvm_x86_ops->
> >+			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
> >+		rdtscll(tsc_now);
> >+		tsc_delta = tsc_target - tsc_now;
> 
> This only works if we have a constant tsc, that's not true for large
> multiboard machines.  Need to do this with irqs disabled as well
> (reading both 'now' and 'tsc_now' in the same critical section).

Should look like this:

local_irq_disable();
u64 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
if (guest_tsc >= tscdeadline)
	hrtimer_start(now);
else {
	ns = convert_to_ns(tscdeadline - guest_tsc);
	hrtimer_start(now + ns);
}
local_irq_enable();

Note the vcpu's tsc can have a different frequency than the host's, so
vcpu_tsc_khz() should be used to convert to nanoseconds, not tsc_khz.

> >+		if (tsc_delta<  0)
> >+			tsc_delta = 0;
> >+
> >+		nsec = tsc_delta * 1000000L / tsc_khz;
> >+		hrtimer_start(&apic->lapic_timer.timer,
> >+			ktime_add_ns(now, nsec), HRTIMER_MODE_ABS);
> >+	}
> >  }
> >
> >@@ -883,6 +936,28 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
> >   *----------------------------------------------------------------------
> >   */
> >
> >+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
> >+{
> >+	struct kvm_lapic *apic = vcpu->arch.apic;
> >+
> >+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
> >+		return 0;
> 
> Why?

The hardware reset value of the IA32_TSC_DEADLINE MSR is 0. In other
timer modes (LVT bit 18 = 0), the IA32_TSC_DEADLINE MSR reads zero and
writes are ignored.

> 
> >+
> >+	return apic->lapic_timer.tscdeadline;
> >+}
> >+
> >+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
> >+{
> >+	struct kvm_lapic *apic = vcpu->arch.apic;
> >+
> >+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
> >+		return;
> >+
> >+	hrtimer_cancel(&apic->lapic_timer.timer);
> >+	apic->lapic_timer.tscdeadline = data;
> >+	start_apic_timer(apic);
> 
> Shouldn't the msr value be updated even if we're outside tsc-deadline mode?
Tian, Kevin - Aug. 29, 2011, 6:47 a.m.
> From: Marcelo Tosatti
> Sent: Tuesday, August 23, 2011 6:47 PM
> 
> > >+		if (!apic->lapic_timer.tscdeadline)
> > >+			return;
> > >+
> > >+		tsc_target = kvm_x86_ops->
> > >+			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
> > >+		rdtscll(tsc_now);
> > >+		tsc_delta = tsc_target - tsc_now;
> >
> > This only works if we have a constant tsc, that's not true for large

that type of machine exposes tricky issues to time virtualization in 
general.

> > multiboard machines.  Need to do this with irqs disabled as well
> > (reading both 'now' and 'tsc_now' in the same critical section).
> 
> Should look like this:
> 
> local_irq_disable();
> u64 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
> if (guest_tsc <= tscdeadline)
>         hrtimer_start(now);
> else {
> 	ns = convert_to_ns(guest_tsc - tscdeadline);
> 	hrtimer_start(now + ns);
> }
> local_irq_enable();

The above is overkill; only the calculation of the guest tsc delta needs to be protected.

On the other hand, I don't think masking irqs here adds any obvious benefit.
Virtualization doesn't ensure micro-level time accuracy, since there are
many events that delay virtual timer interrupt injection even when the timer
emulation logic triggers it on time. That's even the case on bare metal,
though to a smaller extent, and thus most guests are able to handle
such a situation. Masking irqs in non-critical regions simply slows down the
system's response to other events.

> 
> Note the vcpus tsc can have different frequency than the hosts, so
> vcpu_tsc_khz() should be used to convert to nanoseconds, not tsc_khz.
> 

yes, that's a good suggestion.

btw, Jinsong is on vacation this week. He will update the patch according
to your comments when he is back.

Thanks
Kevin
Liu, Jinsong - Sept. 6, 2011, 11:18 a.m.
Thanks Avi, Marcelo, Kevin for the comments; sorry for the late reply (I just came back from vacation).


Avi Kivity wrote:
> On 08/17/2011 07:19 AM, Liu, Jinsong wrote:
>>  From a9670ddff84080c56183e2d678189e100f891174 Mon Sep 17 00:00:00
>> 2001 
>> From: Liu, Jinsong<jinsong.liu@intel.com>
>> Date: Wed, 17 Aug 2011 11:36:28 +0800
>> Subject: [PATCH] KVM: emulate lapic tsc deadline timer for hvm
> 
> kvm doesn't have hvm.
> 

Yes, I will update patch title and description, remove 'hvm'.


>> This patch emulate lapic tsc deadline timer for hvm:
>> Enumerate tsc deadline timer capacibility by CPUID;
>> Enable tsc deadline timer mode by LAPIC MMIO;
>> Start tsc deadline timer by MSR;
> 
>> diff --git a/arch/x86/include/asm/cpufeature.h
>> b/arch/x86/include/asm/cpufeature.h index 4258aac..28bcf48 100644
>> --- a/arch/x86/include/asm/cpufeature.h +++
>> b/arch/x86/include/asm/cpufeature.h @@ -120,6 +120,7 @@
>>   #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
>>   #define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
>>   #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
>> +#define X86_FEATURE_TSC_DEADLINE_TIMER    (4*32+24) /* Tsc deadline
>>   timer */ #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
>>   #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV
>>   */ #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in
>> the OS */ 
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h index 307e3cf..28f7128 100644 ---
>> a/arch/x86/include/asm/kvm_host.h +++
>> b/arch/x86/include/asm/kvm_host.h @@ -635,6 +635,7 @@ struct
>>   	kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu,
>>   			       struct x86_instruction_info *info,
>>   			       enum x86_intercept_stage stage);
>> +	u64 (*guest_to_host_tsc)(u64 guest_tsc);
>>   };
> 
> Please put this near the other tsc functions.  Add a comment
> explaining what value is returned under nested virtualization.

OK

> 
> Please add the svm callback implementation.
> 

It's unnecessary to add an svm callback.
The AMD lapic timer has no tsc deadline mode, so an svm callback would be an unused dummy function.


>> 
>> --- a/arch/x86/include/asm/msr-index.h
>> +++ b/arch/x86/include/asm/msr-index.h
>> @@ -229,6 +229,8 @@
>>   #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>   #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>> 
>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
>> +
>>   #define MSR_IA32_UCODE_WRITE		0x00000079
>>   #define MSR_IA32_UCODE_REV		0x0000008b
>> 
> 
> Need to add to msrs_to_save so live migration works.
> 

Fine

>> 
>> @@ -665,28 +682,30 @@ static void update_divide_count(struct
>>   kvm_lapic *apic) static void start_apic_timer(struct kvm_lapic
>>   	*apic)   { ktime_t now =
>> apic->lapic_timer.timer.base->get_time(); - 
>> -	apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
>> -		    APIC_BUS_CYCLE_NS * apic->divide_count;
>>   	atomic_set(&apic->lapic_timer.pending, 0);
>> 
>> -	if (!apic->lapic_timer.period)
>> -		return;
>> -	/*
>> -	 * Do not allow the guest to program periodic timers with small
>> -	 * interval, since the hrtimers are not throttled by the host
>> -	 * scheduler.
>> -	 */
>> -	if (apic_lvtt_period(apic)) {
>> -		if (apic->lapic_timer.period<  NSEC_PER_MSEC/2)
>> -			apic->lapic_timer.period = NSEC_PER_MSEC/2;
>> -	}
>> +	/* lapic timer in oneshot or peroidic mode */
>> +	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
>> +		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
>> +			    * APIC_BUS_CYCLE_NS * apic->divide_count;
>> 
>> -	hrtimer_start(&apic->lapic_timer.timer,
>> -		      ktime_add_ns(now, apic->lapic_timer.period),
>> -		      HRTIMER_MODE_ABS);
>> +		if (!apic->lapic_timer.period)
>> +			return;
>> +		/*
>> +		 * Do not allow the guest to program periodic timers with small
>> +		 * interval, since the hrtimers are not throttled by the host +		
>> * scheduler. +		 */
>> +		if (apic_lvtt_period(apic)) {
>> +			if (apic->lapic_timer.period<  NSEC_PER_MSEC/2)
>> +				apic->lapic_timer.period = NSEC_PER_MSEC/2;
>> +		}
>> 
>> -	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
>> +		hrtimer_start(&apic->lapic_timer.timer,
>> +			      ktime_add_ns(now, apic->lapic_timer.period), +			     
>> HRTIMER_MODE_ABS); +
>> +		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"   			  
>>   			   PRIx64 ", " "timer initial count 0x%x, period %lldns, "
>>   			   "expire @ 0x%016" PRIx64 ".\n", __func__,
>> @@ -695,6 +714,26 @@ static void start_apic_timer(struct kvm_lapic
>>   			   *apic) apic->lapic_timer.period,
>>   			   ktime_to_ns(ktime_add_ns(now,
>>   					apic->lapic_timer.period)));
>> +	}
>> +
>> +	/* lapic timer in tsc deadline mode */
>> +	if (apic_lvtt_tscdeadline(apic)) {
> 
> 'else if' is slightly better, since it shows the reader the options
> are mutually exclusive.
> 

OK

>> +		u64 tsc_now, tsc_target, tsc_delta, nsec;
>> +
>> +		if (!apic->lapic_timer.tscdeadline)
>> +			return;
>> +
>> +		tsc_target = kvm_x86_ops->
>> +			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
>> +		rdtscll(tsc_now); +		tsc_delta = tsc_target - tsc_now;
> 
> This only works if we have a constant tsc, that's not true for large

Agree with Kevin, variable tsc exposes tricky issues to time virtualization in general.
On some very old processors the tsc may be variable, but all recent processors have a constant tsc regardless of P-states (Px) and C-states (Cx).
For a cpu with lapic tsc deadline timer capability, I think it runs on a constant tsc; otherwise the os has no way to predict the next absolute timepoint.

> multiboard machines.  Need to do this with irqs disabled as well
> (reading both 'now' and 'tsc_now' in the same critical section).
> 
>> +		if (tsc_delta<  0)
>> +			tsc_delta = 0;
>> +
>> +		nsec = tsc_delta * 1000000L / tsc_khz;
>> +		hrtimer_start(&apic->lapic_timer.timer,
>> +			ktime_add_ns(now, nsec), HRTIMER_MODE_ABS);
>> +	}
>>   }
>> 
>> @@ -883,6 +936,28 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
>>   
>> *----------------------------------------------------------------------
>> */  
>> 
>> +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +
>> +	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
>> +		return 0;
> 
> Why?
> 

The Intel SDM defines this hardware behavior (10.5.4.1):
'In other timer modes (LVT bit 18 = 0), the IA32_TSC_DEADLINE MSR reads zero and writes are ignored.'

>> +
>> +	return apic->lapic_timer.tscdeadline;
>> +}
>> +
>> +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
>> +{ +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +
>> +	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) +		return;
>> +
>> +	hrtimer_cancel(&apic->lapic_timer.timer);
>> +	apic->lapic_timer.tscdeadline = data;
>> +	start_apic_timer(apic);
> 
> Shouldn't the msr value be updated even if we're outside tsc-deadline
> mode? 
> 

Same as above, the SDM defines this hardware behavior.

>> +}
>> +
> 
> 
>>   /*
>>    * Empty call-back. Needs to be implemented when VMX enables the
>> SET_TSC_KHZ 
>>    * ioctl. In this case the call-back should update internal vmx
>> state to make @@ -6270,6 +6278,16 @@ static void
>>   		vmx_cpuid_update(struct kvm_vcpu *vcpu)   			} }
>>   	}
>> +
>> +	/*
>> +	 * Emulate Intel lapic tsc deadline timer even if host not support
>> it. +	 * Open CPUID.1.ECX[24] and use bit17/18 as timer mode mask.
>> +	 */ +	best = kvm_find_cpuid_entry(vcpu, 1, 0);
>> +	if (best) {
>> +		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
>> +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3<<  17); +	}
>>   }
> 
> Should be in common code; there is nothing vmx specific about it
> (although it is Intel specific at present).

The AMD lapic timer has no tsc deadline mode, and hence no cpuid.1.ecx[24]; its timer_mode_mask should be 1 << 17 (AMD APM 16.4.1).
It would be safer to handle cpuid and the lapic timer_mode_mask in arch code.


Thanks,
Jinsong
Liu, Jinsong - Sept. 6, 2011, 11:21 a.m.
Marcelo Tosatti wrote:
>>> --- a/arch/x86/include/asm/msr-index.h
>>> +++ b/arch/x86/include/asm/msr-index.h
>>> @@ -229,6 +229,8 @@
>>>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>>> 
>>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
>>> +
>>>  #define MSR_IA32_UCODE_WRITE		0x00000079
>>>  #define MSR_IA32_UCODE_REV		0x0000008b
>> 
>> Need to add to msrs_to_save so live migration works.
> 
> MSR must be explicitly listed in qemu, also.
> 

OK

>>> +		if (!apic->lapic_timer.tscdeadline)
>>> +			return;
>>> +
>>> +		tsc_target = kvm_x86_ops->
>>> +			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
>>> +		rdtscll(tsc_now); +		tsc_delta = tsc_target - tsc_now;
>> 
>> This only works if we have a constant tsc, that's not true for large
>> multiboard machines.  Need to do this with irqs disabled as well
>> (reading both 'now' and 'tsc_now' in the same critical section).
> 
> Should look like this:
> 
> local_irq_disable();
> u64 guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
> if (guest_tsc <= tscdeadline)
>         hrtimer_start(now);
> else {
> 	ns = convert_to_ns(guest_tsc - tscdeadline);
> 	hrtimer_start(now + ns);
> }
> local_irq_enable();
> 
> Note the vcpus tsc can have different frequency than the hosts, so
> vcpu_tsc_khz() should be used to convert to nanoseconds, not tsc_khz.
> 

Fine.


Thanks,
Jinsong
Avi Kivity - Sept. 6, 2011, 11:26 a.m.
On 09/06/2011 02:18 PM, Liu, Jinsong wrote:
> >>    			struct x86_instruction_info *info,
> >>    			enum x86_intercept_stage stage);
> >>  +	u64 (*guest_to_host_tsc)(u64 guest_tsc);
> >>    };
> >
> >  Please put this near the other tsc functions.  Add a comment
> >  explaining what value is returned under nested virtualization.
>
> OK
>
> >
> >  Please add the svm callback implementation.
> >
>
> It's un-necessary to add svm callback.
> AMD lapic timer has no tsc deadline mode, svm callback would be an unused dummy function.

It's a generic function, at the moment it's only used by tsc deadline 
timer but it could be used tomorrow for something else.

> >>  +		u64 tsc_now, tsc_target, tsc_delta, nsec;
> >>  +
> >>  +		if (!apic->lapic_timer.tscdeadline)
> >>  +			return;
> >>  +
> >>  +		tsc_target = kvm_x86_ops->
> >>  +			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
> >>  +		rdtscll(tsc_now); +		tsc_delta = tsc_target - tsc_now;
> >
> >  This only works if we have a constant tsc, that's not true for large
>
> Agree with Kevin, variable tsc exposes tricky issues to time virtualization in general.
> At some very old processors it maybe variable tsc, but all recent processors work on constant tsc regardless of Px and Cx.
> For lapic tsc deadline timer capacibility cpu, I think it works on constant tsc otherwise os has no way to expect next absolute timepoint.

Well, we need something better here.  It's not just old cpus, it's also 
large multi-board machines.

> >>  +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) +{
> >>  +	struct kvm_lapic *apic = vcpu->arch.apic;
> >>  +
> >>  +	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
> >>  +		return 0;
> >
> >  Why?
> >
>
> Intel SDM define such hardware behavior (10.5.4.1):
> 'In other timer modes (LVT bit 18 = 0), the IA32_TSC_DEADLINE MSR reads zero and writes are ignored.'

Ok.

>
> >
> >>    /*
> >>     * Empty call-back. Needs to be implemented when VMX enables the
> >>  SET_TSC_KHZ
> >>     * ioctl. In this case the call-back should update internal vmx
> >>  state to make @@ -6270,6 +6278,16 @@ static void
> >>    		vmx_cpuid_update(struct kvm_vcpu *vcpu)   			} }
> >>    	}
> >>  +
> >>  +	/*
> >>  +	 * Emulate Intel lapic tsc deadline timer even if host not support
> >>  it. +	 * Open CPUID.1.ECX[24] and use bit17/18 as timer mode mask.
> >>  +	 */ +	best = kvm_find_cpuid_entry(vcpu, 1, 0);
> >>  +	if (best) {
> >>  +		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
> >>  +		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3<<   17); +	}
> >>    }
> >
> >  Should be in common code; there is nothing vmx specific about it
> >  (although it is Intel specific at present).
>
> AMD lapic timer has no tsc deadline mode and so cpuid.1.ecx[24], and its timer_mode_mask should be 1<<  17 (AMD APM 16.4.1.)
> It would be more safe to handle cpuid and lapic timer_mode_mask in arch code.
>

Again, vmx.c is NOT about Intel specific code.  It is about vmx-specific 
code - programming vmx via VMREAD and VMWRITE.

(not to mention that we don't really need host cpu support for this - we 
can emulate tsc deadline mode without host cpu support at all, as this 
patchset shows).
Liu, Jinsong - Sept. 7, 2011, 4:45 p.m.
Avi Kivity wrote:
>> 
>> --- a/arch/x86/include/asm/msr-index.h
>> +++ b/arch/x86/include/asm/msr-index.h
>> @@ -229,6 +229,8 @@
>>   #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>   #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>> 
>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
>> +
>>   #define MSR_IA32_UCODE_WRITE		0x00000079
>>   #define MSR_IA32_UCODE_REV		0x0000008b
>> 
> 
> Need to add to msrs_to_save so live migration works.
> 

2 questions:
1). How about adding it to emulated_msrs instead of msrs_to_save? msrs_to_save is modified at runtime and depends on the capabilities of the host cpu.
2). Do we need to add code in qemu (kvm_get_msrs/kvm_put_msrs) to expose MSR_IA32_TSCDEADLINE to userspace?

Thanks,
Jinsong
Avi Kivity - Sept. 7, 2011, 5:06 p.m.
On 09/07/2011 07:45 PM, Liu, Jinsong wrote:
> Avi Kivity wrote:
> >>
> >>  --- a/arch/x86/include/asm/msr-index.h
> >>  +++ b/arch/x86/include/asm/msr-index.h
> >>  @@ -229,6 +229,8 @@
> >>    #define MSR_IA32_APICBASE_ENABLE	(1<<11)
> >>    #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
> >>
> >>  +#define MSR_IA32_TSCDEADLINE		0x000006e0
> >>  +
> >>    #define MSR_IA32_UCODE_WRITE		0x00000079
> >>    #define MSR_IA32_UCODE_REV		0x0000008b
> >>
> >
> >  Need to add to msrs_to_save so live migration works.
> >
>
> 2 questions:
> 1). how about add to emulated_msrs instead of msrs_to_save? msrs_to_save modified at runtime and depend on capacibility of host cpu.

Look at kvm_init_msrs_list(), it does the checks.

> 2). do we need add code at qemu (kvm_get_msrs/ kvm_put_msrs) to expose MSR_IA32_TSCDEADLINE to userspace?
>

It should be automatic.  Better check, though, that you can
live-migrate a guest that uses TSC deadline.  Please add a testcase to
kvm-unit-tests.git (there's x86/apic.c, it can probably be added there easily).
Liu, Jinsong - Sept. 7, 2011, 5:33 p.m.
Avi Kivity wrote:
> On 09/07/2011 07:45 PM, Liu, Jinsong wrote:
>> Avi Kivity wrote:
>>>> 
>>>>  --- a/arch/x86/include/asm/msr-index.h
>>>>  +++ b/arch/x86/include/asm/msr-index.h
>>>>  @@ -229,6 +229,8 @@
>>>>    #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>>>    #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>>>> 
>>>>  +#define MSR_IA32_TSCDEADLINE		0x000006e0
>>>>  +
>>>>    #define MSR_IA32_UCODE_WRITE		0x00000079
>>>>    #define MSR_IA32_UCODE_REV		0x0000008b
>>>> 
>>> 
>>>  Need to add to msrs_to_save so live migration works.
>>> 
>> 
>> 2 questions:
>> 1). how about add to emulated_msrs instead of msrs_to_save?
>> msrs_to_save modified at runtime and depend on capacibility of host
>> cpu.  
> 
> Look at kvm_init_msrs_list(), it does the checks.
> 

Yes, what I mean is that we in fact don't need host cpu support, so it's better to add it to emulated_msrs.

>> 2). do we need add code at qemu (kvm_get_msrs/ kvm_put_msrs) to
>> expose MSR_IA32_TSCDEADLINE to userspace? 
>> 
> 
> It should be automatic.  Better check it though that you can
> live-migrate a guest that uses TSC deadline.  Please add a testcase to
> kvm-unit-tests.git (there's x86/apic.c, can probably be added there
> easily).
Liu, Jinsong - Sept. 8, 2011, 5:12 p.m.
>>> --- a/arch/x86/include/asm/msr-index.h
>>> +++ b/arch/x86/include/asm/msr-index.h
>>> @@ -229,6 +229,8 @@
>>>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>>> 
>>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
>>> +
>>>  #define MSR_IA32_UCODE_WRITE		0x00000079
>>>  #define MSR_IA32_UCODE_REV		0x0000008b
>> 
>> Need to add to msrs_to_save so live migration works.
> 
> MSR must be explicitly listed in qemu, also.
> 

Marcelo, it seems the MSR doesn't need to be explicitly listed in qemu?
Adding MSR_IA32_TSCDEADLINE to msrs_to_save on the KVM side is enough; qemu will get it through KVM_GET_MSR_INDEX_LIST.
Am I missing something?

Thanks,
Jinsong
Marcelo Tosatti - Sept. 9, 2011, 12:56 p.m.
On Fri, Sep 09, 2011 at 01:12:51AM +0800, Liu, Jinsong wrote:
> >>> --- a/arch/x86/include/asm/msr-index.h
> >>> +++ b/arch/x86/include/asm/msr-index.h
> >>> @@ -229,6 +229,8 @@
> >>>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
> >>>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
> >>> 
> >>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
> >>> +
> >>>  #define MSR_IA32_UCODE_WRITE		0x00000079
> >>>  #define MSR_IA32_UCODE_REV		0x0000008b
> >> 
> >> Need to add to msrs_to_save so live migration works.
> > 
> > MSR must be explicitly listed in qemu, also.
> > 
> 
> Marcelo, seems MSR don't need explicitly list in qemu?
> KVM side adding MSR_IA32_TSCDEADLINE to msrs_to_save is enough. Qemu will get it through KVM_GET_MSR_INDEX_LIST.
> Do I miss something?

Notice in target-i386/kvm.c the KVM_GET_MSR_INDEX_LIST list is only used
for MSR_STAR/MSR_HSAVE_PA presence detection.

You do need to explicitly add MSR_IA32_TSCDEADLINE to the
kvm_get_msrs/kvm_put_msrs routines.
Liu, Jinsong - Sept. 9, 2011, 6:11 p.m.
Marcelo Tosatti wrote:
> On Fri, Sep 09, 2011 at 01:12:51AM +0800, Liu, Jinsong wrote:
>>>>> --- a/arch/x86/include/asm/msr-index.h
>>>>> +++ b/arch/x86/include/asm/msr-index.h
>>>>> @@ -229,6 +229,8 @@
>>>>>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>>>>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>>>>> 
>>>>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
>>>>> +
>>>>>  #define MSR_IA32_UCODE_WRITE		0x00000079
>>>>>  #define MSR_IA32_UCODE_REV		0x0000008b
>>>> 
>>>> Need to add to msrs_to_save so live migration works.
>>> 
>>> MSR must be explicitly listed in qemu, also.
>>> 
>> 
>> Marcelo, seems MSR don't need explicitly list in qemu?
>> KVM side adding MSR_IA32_TSCDEADLINE to msrs_to_save is enough. Qemu
>> will get it through KVM_GET_MSR_INDEX_LIST. Do I miss something?
> 
> Notice in target-i386/kvm.c the KVM_GET_MSR_INDEX_LIST list is only
> used for MSR_STAR/MSR_HSAVE_PA presence detection.

Yes

> 
> Do you do need to explicitly add MSR_IA32_TSCDEADLINE to
> kvm_get_msrs/kvm_put_msrs routines.

My question is, which kvm_get_msrs/kvm_put_msrs routine is used by live migration: the one in target-i386/kvm.c, or the one in kvm/libkvm/libkvm-x86.c? They both use the KVM_GET_MSR_INDEX_LIST/KVM_GET_MSRS/KVM_SET_MSRS ioctls, but I'm not clear on the difference in their purpose/usage.

Thanks,
Jinsong
Liu, Jinsong - Sept. 9, 2011, 6:21 p.m.
Marcelo Tosatti wrote:
> On Fri, Sep 09, 2011 at 01:12:51AM +0800, Liu, Jinsong wrote:
>>>>> --- a/arch/x86/include/asm/msr-index.h
>>>>> +++ b/arch/x86/include/asm/msr-index.h
>>>>> @@ -229,6 +229,8 @@
>>>>>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
>>>>>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
>>>>> 
>>>>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
>>>>> +
>>>>>  #define MSR_IA32_UCODE_WRITE		0x00000079
>>>>>  #define MSR_IA32_UCODE_REV		0x0000008b
>>>> 
>>>> Need to add to msrs_to_save so live migration works.
>>> 
>>> MSR must be explicitly listed in qemu, also.
>>> 
>> 
>> Marcelo, seems MSR don't need explicitly list in qemu?
>> KVM side adding MSR_IA32_TSCDEADLINE to msrs_to_save is enough. Qemu
>> will get it through KVM_GET_MSR_INDEX_LIST. Do I miss something?
> 
> Notice in target-i386/kvm.c the KVM_GET_MSR_INDEX_LIST list is only
> used for MSR_STAR/MSR_HSAVE_PA presence detection.

Yes, but in kvm/libkvm/libkvm-x86.c KVM_GET_MSR_INDEX_LIST is used to get the whole list.
That's what I want to make clear --> which one does live migration use?

> 
> Do you do need to explicitly add MSR_IA32_TSCDEADLINE to
> kvm_get_msrs/kvm_put_msrs routines.
Marcelo Tosatti - Sept. 9, 2011, 6:36 p.m.
On Sat, Sep 10, 2011 at 02:11:36AM +0800, Liu, Jinsong wrote:
> Marcelo Tosatti wrote:
> > On Fri, Sep 09, 2011 at 01:12:51AM +0800, Liu, Jinsong wrote:
> >>>>> --- a/arch/x86/include/asm/msr-index.h
> >>>>> +++ b/arch/x86/include/asm/msr-index.h
> >>>>> @@ -229,6 +229,8 @@
> >>>>>  #define MSR_IA32_APICBASE_ENABLE	(1<<11)
> >>>>>  #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
> >>>>> 
> >>>>> +#define MSR_IA32_TSCDEADLINE		0x000006e0
> >>>>> +
> >>>>>  #define MSR_IA32_UCODE_WRITE		0x00000079
> >>>>>  #define MSR_IA32_UCODE_REV		0x0000008b
> >>>> 
> >>>> Need to add to msrs_to_save so live migration works.
> >>> 
> >>> MSR must be explicitly listed in qemu, also.
> >>> 
> >> 
> >> Marcelo, seems MSR don't need explicitly list in qemu?
> >> KVM side adding MSR_IA32_TSCDEADLINE to msrs_to_save is enough. Qemu
> >> will get it through KVM_GET_MSR_INDEX_LIST. Do I miss something?
> > 
> > Notice in target-i386/kvm.c the KVM_GET_MSR_INDEX_LIST list is only
> > used for MSR_STAR/MSR_HSAVE_PA presence detection.
> 
> Yes
> 
> > 
> > Do you do need to explicitly add MSR_IA32_TSCDEADLINE to
> > kvm_get_msrs/kvm_put_msrs routines.
> 
> My question is, which kvm_get_msrs/kvm_put_msrs routine be used by live migration, the routine in target-i386/kvm.c, or in kvm/libkvm/libkvm-x86.c? They both have ioctl KVM_GET_MSR_INDEX_LIST/ KVM_GET_MSRS/ KVM_SET_MSRS, but I'm not clear their purpose/usage difference.

kvm_get_msrs/kvm_put_msrs in target-i386/kvm.c. kvm/ directory is dead.

Patch

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 34595d5..3925d80 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -100,7 +100,9 @@ 
 #define		APIC_TIMER_BASE_CLKIN		0x0
 #define		APIC_TIMER_BASE_TMBASE		0x1
 #define		APIC_TIMER_BASE_DIV		0x2
+#define		APIC_LVT_TIMER_ONESHOT		(0 << 17)
 #define		APIC_LVT_TIMER_PERIODIC		(1 << 17)
+#define		APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)
 #define		APIC_LVT_MASKED			(1 << 16)
 #define		APIC_LVT_LEVEL_TRIGGER		(1 << 15)
 #define		APIC_LVT_REMOTE_IRR		(1 << 14)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4258aac..28bcf48 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -120,6 +120,7 @@ 
 #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
 #define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
 #define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER    (4*32+24) /* Tsc deadline timer */
 #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 307e3cf..28f7128 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -635,6 +635,7 @@  struct kvm_x86_ops {
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
 			       struct x86_instruction_info *info,
 			       enum x86_intercept_stage stage);
+	u64 (*guest_to_host_tsc)(u64 guest_tsc);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d52609a..a6962d9 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -229,6 +229,8 @@ 
 #define MSR_IA32_APICBASE_ENABLE	(1<<11)
 #define MSR_IA32_APICBASE_BASE		(0xfffff<<12)
 
+#define MSR_IA32_TSCDEADLINE		0x000006e0
+
 #define MSR_IA32_UCODE_WRITE		0x00000079
 #define MSR_IA32_UCODE_REV		0x0000008b
 
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 64bc6ea..497dbaa 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -2,6 +2,8 @@ 
 struct kvm_timer {
 	struct hrtimer timer;
 	s64 period; 				/* unit: ns */
+	u32 timer_mode_mask;
+	u64 tscdeadline;
 	atomic_t pending;			/* accumulated triggered timers */
 	bool reinject;
 	struct kvm_timer_ops *t_ops;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2b2255b..780a5b0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -135,9 +135,23 @@  static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
 	return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
 }
 
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+	return ((apic_get_reg(apic, APIC_LVTT) & 
+		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+}
+
 static inline int apic_lvtt_period(struct kvm_lapic *apic)
 {
-	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+	return ((apic_get_reg(apic, APIC_LVTT) & 
+		apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+	return ((apic_get_reg(apic, APIC_LVTT) & 
+		apic->lapic_timer.timer_mode_mask) == 
+			APIC_LVT_TIMER_TSCDEADLINE);
 }
 
 static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -166,7 +180,7 @@  static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 }
 
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
-	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
+	LVT_MASK ,	/* part LVTT mask, timer mode mask added at runtime */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTPC */
 	LINT_MASK, LINT_MASK,	/* LVT0-1 */
@@ -570,6 +584,9 @@  static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		break;
 
 	case APIC_TMCCT:	/* Timer CCR */
+		if (apic_lvtt_tscdeadline(apic))
+			return 0;
+
 		val = apic_get_tmcct(apic);
 		break;
 
@@ -665,28 +682,30 @@  static void update_divide_count(struct kvm_lapic *apic)
 static void start_apic_timer(struct kvm_lapic *apic)
 {
 	ktime_t now = apic->lapic_timer.timer.base->get_time();
-
-	apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
-		    APIC_BUS_CYCLE_NS * apic->divide_count;
 	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (!apic->lapic_timer.period)
-		return;
-	/*
-	 * Do not allow the guest to program periodic timers with small
-	 * interval, since the hrtimers are not throttled by the host
-	 * scheduler.
-	 */
-	if (apic_lvtt_period(apic)) {
-		if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
-			apic->lapic_timer.period = NSEC_PER_MSEC/2;
-	}
+	/* lapic timer in oneshot or peroidic mode */
+	if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+		apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+			    * APIC_BUS_CYCLE_NS * apic->divide_count;
 
-	hrtimer_start(&apic->lapic_timer.timer,
-		      ktime_add_ns(now, apic->lapic_timer.period),
-		      HRTIMER_MODE_ABS);
+		if (!apic->lapic_timer.period)
+			return;
+		/*
+		 * Do not allow the guest to program periodic timers with small
+		 * interval, since the hrtimers are not throttled by the host
+		 * scheduler.
+		 */
+		if (apic_lvtt_period(apic)) {
+			if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
+				apic->lapic_timer.period = NSEC_PER_MSEC/2;
+		}
 
-	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+		hrtimer_start(&apic->lapic_timer.timer,
+			      ktime_add_ns(now, apic->lapic_timer.period),
+			      HRTIMER_MODE_ABS);
+
+		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 			   PRIx64 ", "
 			   "timer initial count 0x%x, period %lldns, "
 			   "expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +714,26 @@  static void start_apic_timer(struct kvm_lapic *apic)
 			   apic->lapic_timer.period,
 			   ktime_to_ns(ktime_add_ns(now,
 					apic->lapic_timer.period)));
+	}
+
+	/* lapic timer in tsc deadline mode */
+	if (apic_lvtt_tscdeadline(apic)) {
+		u64 tsc_now, tsc_target, tsc_delta, nsec;
+
+		if (!apic->lapic_timer.tscdeadline)
+			return;
+
+		tsc_target = kvm_x86_ops->
+			guest_to_host_tsc(apic->lapic_timer.tscdeadline);
+		rdtscll(tsc_now);
+		tsc_delta = tsc_target - tsc_now;
+		if (tsc_delta < 0)
+			tsc_delta = 0;
+
+		nsec = tsc_delta * 1000000L / tsc_khz;
+		hrtimer_start(&apic->lapic_timer.timer, 
+			ktime_add_ns(now, nsec), HRTIMER_MODE_ABS);
+	}
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -782,7 +821,6 @@  static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 	case APIC_LVT0:
 		apic_manage_nmi_watchdog(apic, val);
-	case APIC_LVTT:
 	case APIC_LVTTHMR:
 	case APIC_LVTPC:
 	case APIC_LVT1:
@@ -796,7 +834,22 @@  static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 		break;
 
+	case APIC_LVTT:
+		if ((apic_get_reg(apic, APIC_LVTT) & 
+		     apic->lapic_timer.timer_mode_mask) !=
+		    (val & apic->lapic_timer.timer_mode_mask))
+			hrtimer_cancel(&apic->lapic_timer.timer);
+
+		if (!apic_sw_enabled(apic))
+			val |= APIC_LVT_MASKED;
+		val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+		apic_set_reg(apic, APIC_LVTT, val);
+		break;
+
 	case APIC_TMICT:
+		if (apic_lvtt_tscdeadline(apic))
+			break;
+
 		hrtimer_cancel(&apic->lapic_timer.timer);
 		apic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
@@ -883,6 +936,28 @@  void kvm_free_lapic(struct kvm_vcpu *vcpu)
  *----------------------------------------------------------------------
  */
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+		return 0;
+
+	return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+		return;
+
+	hrtimer_cancel(&apic->lapic_timer.timer);
+	apic->lapic_timer.tscdeadline = data;
+	start_apic_timer(apic);
+}
+
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
@@ -1076,6 +1151,8 @@  int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	apic->lapic_timer.kvm = vcpu->kvm;
 	apic->lapic_timer.vcpu = vcpu;
 
+	apic->lapic_timer.timer_mode_mask = (1 << 17);
+
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
 	vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 52c9e6b..10c6ee6 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -41,6 +41,9 @@  int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
 void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
 void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e26629f..f4ec4d3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1747,6 +1747,14 @@  static u64 guest_read_tsc(void)
 	return host_tsc + tsc_offset;
 }
 
+static u64 vmx_guest_to_host_tsc(u64 guest_tsc)
+{
+	u64 tsc_offset;
+
+	tsc_offset = vmcs_read64(TSC_OFFSET);
+	return guest_tsc - tsc_offset;
+}
+
 /*
  * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
  * ioctl. In this case the call-back should update internal vmx state to make
@@ -6270,6 +6278,16 @@  static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 			}
 		}
 	}
+
+	/* 
+	 * Emulate Intel lapic tsc deadline timer even if host not support it.
+	 * Open CPUID.1.ECX[24] and use bit17/18 as timer mode mask.
+	 */
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (best) {
+		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
+		vcpu->arch.apic->lapic_timer.timer_mode_mask = (3 << 17);
+	}
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7015,6 +7033,7 @@  static struct kvm_x86_ops vmx_x86_ops = {
 	.set_tdp_cr3 = vmx_set_cr3,
 
 	.check_intercept = vmx_check_intercept,
+	.guest_to_host_tsc = vmx_guest_to_host_tsc,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6cb353c..c409fbe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1564,6 +1564,9 @@  int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_write(vcpu, msr, data);
+	case MSR_IA32_TSCDEADLINE:
+		kvm_set_lapic_tscdeadline_msr(vcpu, data);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
@@ -1891,6 +1894,9 @@  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
 		return kvm_x2apic_msr_read(vcpu, msr, pdata);
 		break;
+	case MSR_IA32_TSCDEADLINE:
+		data = kvm_get_lapic_tscdeadline_msr(vcpu);
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;