diff mbox

[1/1,V5,tuning] kernel/kvm: introduce KVM_SET_LINT1 and fix improper nmi emulation

Message ID 4E980621.9050301@cn.fujitsu.com
State New
Headers show

Commit Message

Lai Jiangshan Oct. 14, 2011, 9:51 a.m. UTC
Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
button event happens. This doesn't properly emulate real hardware on
which NMI button event triggers LINT1. Because of this, NMI is sent to
the processor even when LINT1 is masked in LVT. For example, this
causes the problem that kdump initiated by NMI sometimes doesn't work
on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.

With this patch, we introduce KVM_SET_LINT1,
and we can use KVM_SET_LINT1 to correctly emulate the NMI button
without changing the old KVM_NMI behavior.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
---
 arch/x86/kvm/irq.h   |    1 +
 arch/x86/kvm/lapic.c |    7 +++++++
 arch/x86/kvm/x86.c   |    8 ++++++++
 include/linux/kvm.h  |    3 +++
 4 files changed, 19 insertions(+), 0 deletions(-)

Comments

Sasha Levin Oct. 14, 2011, 11:59 a.m. UTC | #1
On Fri, 2011-10-14 at 17:51 +0800, Lai Jiangshan wrote:
> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
> button event happens. This doesn't properly emulate real hardware on
> which NMI button event triggers LINT1. Because of this, NMI is sent to
> the processor even when LINT1 is masked in LVT. For example, this
> causes the problem that kdump initiated by NMI sometimes doesn't work
> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
> 
> With this patch, we introduce introduce KVM_SET_LINT1,
> and we can use KVM_SET_LINT1 to correctly emulate NMI button
> without change the old KVM_NMI behavior.
> 
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
> ---

It could use a documentation update as well.

>  arch/x86/kvm/irq.h   |    1 +
>  arch/x86/kvm/lapic.c |    7 +++++++
>  arch/x86/kvm/x86.c   |    8 ++++++++
>  include/linux/kvm.h  |    3 +++
>  4 files changed, 19 insertions(+), 0 deletions(-)
> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> index 53e2d08..0c96315 100644
> --- a/arch/x86/kvm/irq.h
> +++ b/arch/x86/kvm/irq.h
> @@ -95,6 +95,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
>  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
>  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
>  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
>  void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 57dcbd4..87fe36a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1039,6 +1039,13 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
>  		kvm_apic_local_deliver(apic, APIC_LVT0);
>  }
>  
> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	kvm_apic_local_deliver(apic, APIC_LVT1);
> +}
> +
>  static struct kvm_timer_ops lapic_timer_ops = {
>  	.is_periodic = lapic_is_periodic,
>  };
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 84a28ea..fccd094 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_XSAVE:
>  	case KVM_CAP_ASYNC_PF:
>  	case KVM_CAP_GET_TSC_KHZ:
> +	case KVM_CAP_SET_LINT1:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> @@ -3264,6 +3265,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  
>  		goto out;
>  	}
> +	case KVM_SET_LINT1: {
> +		r = -EINVAL;
> +		if (!irqchip_in_kernel(vcpu->kvm))
> +			goto out;
> +		r = 0;
> +		kvm_apic_lint1_deliver(vcpu);

We simply ignore the return value of kvm_apic_local_deliver() and assume
it always works. Why?
Jan Kiszka Oct. 14, 2011, 12:07 p.m. UTC | #2
On 2011-10-14 13:59, Sasha Levin wrote:
> On Fri, 2011-10-14 at 17:51 +0800, Lai Jiangshan wrote:
>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>> button event happens. This doesn't properly emulate real hardware on
>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>> the processor even when LINT1 is masked in LVT. For example, this
>> causes the problem that kdump initiated by NMI sometimes doesn't work
>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>
>> With this patch, we introduce introduce KVM_SET_LINT1,
>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>> without change the old KVM_NMI behavior.
>>
>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>> Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
>> ---
> 
> It could use a documentation update as well.
> 
>>  arch/x86/kvm/irq.h   |    1 +
>>  arch/x86/kvm/lapic.c |    7 +++++++
>>  arch/x86/kvm/x86.c   |    8 ++++++++
>>  include/linux/kvm.h  |    3 +++
>>  4 files changed, 19 insertions(+), 0 deletions(-)
>> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
>> index 53e2d08..0c96315 100644
>> --- a/arch/x86/kvm/irq.h
>> +++ b/arch/x86/kvm/irq.h
>> @@ -95,6 +95,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
>>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
>>  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
>>  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
>>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
>>  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
>>  void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 57dcbd4..87fe36a 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -1039,6 +1039,13 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
>>  		kvm_apic_local_deliver(apic, APIC_LVT0);
>>  }
>>  
>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +
>> +	kvm_apic_local_deliver(apic, APIC_LVT1);
>> +}
>> +
>>  static struct kvm_timer_ops lapic_timer_ops = {
>>  	.is_periodic = lapic_is_periodic,
>>  };
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 84a28ea..fccd094 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>>  	case KVM_CAP_XSAVE:
>>  	case KVM_CAP_ASYNC_PF:
>>  	case KVM_CAP_GET_TSC_KHZ:
>> +	case KVM_CAP_SET_LINT1:
>>  		r = 1;
>>  		break;
>>  	case KVM_CAP_COALESCED_MMIO:
>> @@ -3264,6 +3265,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>>  
>>  		goto out;
>>  	}
>> +	case KVM_SET_LINT1: {
>> +		r = -EINVAL;
>> +		if (!irqchip_in_kernel(vcpu->kvm))
>> +			goto out;
>> +		r = 0;
>> +		kvm_apic_lint1_deliver(vcpu);
> 
> We simply ignore the return value of kvm_apic_local_deliver() and assume
> it always works. why?
> 

Hmm, I suddenly realized that we switched from enhancing the KVM_NMI
IOCTL to adding KVM_SET_LINT1 - what motivated this?

( Maybe we should let the kernel part settle first before iterating
through user space changes. )

Jan
Lai Jiangshan Oct. 16, 2011, 3:01 p.m. UTC | #3
On 10/14/2011 08:07 PM, Jan Kiszka wrote:
> On 2011-10-14 13:59, Sasha Levin wrote:
>> On Fri, 2011-10-14 at 17:51 +0800, Lai Jiangshan wrote:
>>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>>> button event happens. This doesn't properly emulate real hardware on
>>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>>> the processor even when LINT1 is masked in LVT. For example, this
>>> causes the problem that kdump initiated by NMI sometimes doesn't work
>>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>>
>>> With this patch, we introduce introduce KVM_SET_LINT1,
>>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>>> without change the old KVM_NMI behavior.
>>>
>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
>>> ---
>>
>> It could use a documentation update as well.
>>
>>>  arch/x86/kvm/irq.h   |    1 +
>>>  arch/x86/kvm/lapic.c |    7 +++++++
>>>  arch/x86/kvm/x86.c   |    8 ++++++++
>>>  include/linux/kvm.h  |    3 +++
>>>  4 files changed, 19 insertions(+), 0 deletions(-)
>>> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
>>> index 53e2d08..0c96315 100644
>>> --- a/arch/x86/kvm/irq.h
>>> +++ b/arch/x86/kvm/irq.h
>>> @@ -95,6 +95,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
>>>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
>>>  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
>>>  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
>>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
>>>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
>>>  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
>>>  void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>> index 57dcbd4..87fe36a 100644
>>> --- a/arch/x86/kvm/lapic.c
>>> +++ b/arch/x86/kvm/lapic.c
>>> @@ -1039,6 +1039,13 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
>>>  		kvm_apic_local_deliver(apic, APIC_LVT0);
>>>  }
>>>  
>>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
>>> +{
>>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>>> +
>>> +	kvm_apic_local_deliver(apic, APIC_LVT1);
>>> +}
>>> +
>>>  static struct kvm_timer_ops lapic_timer_ops = {
>>>  	.is_periodic = lapic_is_periodic,
>>>  };
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index 84a28ea..fccd094 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>>>  	case KVM_CAP_XSAVE:
>>>  	case KVM_CAP_ASYNC_PF:
>>>  	case KVM_CAP_GET_TSC_KHZ:
>>> +	case KVM_CAP_SET_LINT1:
>>>  		r = 1;
>>>  		break;
>>>  	case KVM_CAP_COALESCED_MMIO:
>>> @@ -3264,6 +3265,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>>>  
>>>  		goto out;
>>>  	}
>>> +	case KVM_SET_LINT1: {
>>> +		r = -EINVAL;
>>> +		if (!irqchip_in_kernel(vcpu->kvm))
>>> +			goto out;
>>> +		r = 0;
>>> +		kvm_apic_lint1_deliver(vcpu);
>>
>> We simply ignore the return value of kvm_apic_local_deliver() and assume
>> it always works. why?
>>
> 
> Hmm, I suddenly realized that we switched from enhancing the KVM_NMI
> IOCTL to adding KVM_SET_LINT1 - what motivated this?

Enhancing KVM_NMI directly fixes the problem and matches
real hardware more closely, but it changes the API behavior. (We preferred this one.)

From the previous mails, I found that you and Avi prefer SET_LINT1,
which keeps the old behavior, and that is also OK for us.
But I found it hard to implement before, and I switched to
this one when you gave me the clue.


> 
> ( Maybe we should let the kernel part settle first before iterating
> through user space changes. )
> 


Yes, you are right, we should settle the kernel side first,
but I need your and Avi's suggestions.

Thanks,
Lai
diff mbox

Patch

diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 53e2d08..0c96315 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -95,6 +95,7 @@  void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57dcbd4..87fe36a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1039,6 +1039,13 @@  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
+void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	kvm_apic_local_deliver(apic, APIC_LVT1);
+}
+
 static struct kvm_timer_ops lapic_timer_ops = {
 	.is_periodic = lapic_is_periodic,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84a28ea..fccd094 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2077,6 +2077,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
+	case KVM_CAP_SET_LINT1:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3264,6 +3265,13 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 
 		goto out;
 	}
+	case KVM_SET_LINT1: {
+		r = -EINVAL;
+		if (!irqchip_in_kernel(vcpu->kvm))
+			goto out;
+		r = 0;
+		kvm_apic_lint1_deliver(vcpu);
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index aace6b8..11a2c42 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,7 @@  struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_S390_GMAP 71
+#define KVM_CAP_SET_LINT1 72
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -759,6 +760,8 @@  struct kvm_clock_data {
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+/* Available with KVM_CAP_SET_LINT1 for x86 */
+#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)