
[1/1,V5] kernel/kvm: introduce KVM_SET_LINT1 and fix improper nmi emulation

Message ID 4E97FAC7.6080007@cn.fujitsu.com
State New

Commit Message

Lai Jiangshan Oct. 14, 2011, 9:03 a.m. UTC
Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
button event happens. This doesn't properly emulate real hardware on
which NMI button event triggers LINT1. Because of this, NMI is sent to
the processor even when LINT1 is masked in LVT. For example, this
causes the problem that kdump initiated by NMI sometimes doesn't work
on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.

With this patch, we introduce KVM_SET_LINT1, which can be used to
correctly emulate the NMI button without changing the old KVM_NMI
behavior.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
---
 arch/x86/include/asm/kvm.h |    1 +
 arch/x86/kvm/irq.h         |    1 +
 arch/x86/kvm/lapic.c       |    7 +++++++
 arch/x86/kvm/x86.c         |    8 ++++++++
 include/linux/kvm.h        |    5 +++++
 5 files changed, 22 insertions(+), 0 deletions(-)
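
For illustration, a minimal userspace sketch (not part of the patch) of how
a VMM might forward an NMI-button event with the new ioctl; vcpu_fds,
nr_vcpus and have_set_lint1 are hypothetical names, and the fallback to the
old KVM_NMI ioctl is an assumption about how older kernels would be handled:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: called when the virtual NMI button is pressed. */
static void nmi_button_pressed(int *vcpu_fds, int nr_vcpus, bool have_set_lint1)
{
	int i;

	for (i = 0; i < nr_vcpus; i++) {
		if (have_set_lint1)
			/* the kernel delivers through LVT1 and honors the LVT mask */
			ioctl(vcpu_fds[i], KVM_SET_LINT1);
		else
			/* old behavior: the NMI is injected unconditionally */
			ioctl(vcpu_fds[i], KVM_NMI);
	}
}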

Comments

Jan Kiszka Oct. 14, 2011, 9:07 a.m. UTC | #1
On 2011-10-14 11:03, Lai Jiangshan wrote:
> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
> button event happens. This doesn't properly emulate real hardware on
> which NMI button event triggers LINT1. Because of this, NMI is sent to
> the processor even when LINT1 is masked in LVT. For example, this
> causes the problem that kdump initiated by NMI sometimes doesn't work
> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
> 
> With this patch, we introduce introduce KVM_SET_LINT1,
> and we can use KVM_SET_LINT1 to correctly emulate NMI button
> without change the old KVM_NMI behavior.
> 
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
> ---
>  arch/x86/include/asm/kvm.h |    1 +
>  arch/x86/kvm/irq.h         |    1 +
>  arch/x86/kvm/lapic.c       |    7 +++++++
>  arch/x86/kvm/x86.c         |    8 ++++++++
>  include/linux/kvm.h        |    5 +++++
>  5 files changed, 22 insertions(+), 0 deletions(-)
> diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
> index 4d8dcbd..88d0ac3 100644
> --- a/arch/x86/include/asm/kvm.h
> +++ b/arch/x86/include/asm/kvm.h
> @@ -24,6 +24,7 @@
>  #define __KVM_HAVE_DEBUGREGS
>  #define __KVM_HAVE_XSAVE
>  #define __KVM_HAVE_XCRS
> +#define __KVM_HAVE_SET_LINT1
>  
>  /* Architectural interrupt line count. */
>  #define KVM_NR_INTERRUPTS 256
> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
> index 53e2d08..0c96315 100644
> --- a/arch/x86/kvm/irq.h
> +++ b/arch/x86/kvm/irq.h
> @@ -95,6 +95,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
>  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
>  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
>  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
>  void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 57dcbd4..87fe36a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -1039,6 +1039,13 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
>  		kvm_apic_local_deliver(apic, APIC_LVT0);
>  }
>  
> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +
> +	kvm_apic_local_deliver(apic, APIC_LVT1);
> +}
> +
>  static struct kvm_timer_ops lapic_timer_ops = {
>  	.is_periodic = lapic_is_periodic,
>  };
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 84a28ea..fccd094 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_XSAVE:
>  	case KVM_CAP_ASYNC_PF:
>  	case KVM_CAP_GET_TSC_KHZ:
> +	case KVM_CAP_SET_LINT1:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> @@ -3264,6 +3265,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>  
>  		goto out;
>  	}
> +	case KVM_SET_LINT1: {
> +		r = -EINVAL;
> +		if (!irqchip_in_kernel(vcpu->kvm))
> +			goto out;
> +		r = 0;
> +		kvm_apic_lint1_deliver(vcpu);
> +	}
>  	default:
>  		r = -EINVAL;
>  	}
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index aace6b8..3a10572 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -554,6 +554,9 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_SMT 64
>  #define KVM_CAP_PPC_RMA	65
>  #define KVM_CAP_S390_GMAP 71
> +#ifdef __KVM_HAVE_SET_LINT1
> +#define KVM_CAP_SET_LINT1 72
> +#endif

Actually, there is no need for __KVM_HAVE_SET_LINT1 and #ifdef. User
land will just do a runtime check.
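
For illustration, the runtime check could look roughly like this (kvm_fd is
assumed to be an open /dev/kvm file descriptor, and KVM_CAP_SET_LINT1 only
exists with this patch's headers):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Probe for the capability at runtime instead of at build time. */
static int has_set_lint1(int kvm_fd)
{
	return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SET_LINT1) > 0;
}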

Jan
Lai Jiangshan Oct. 14, 2011, 9:27 a.m. UTC | #2
On 10/14/2011 05:07 PM, Jan Kiszka wrote:
> On 2011-10-14 11:03, Lai Jiangshan wrote:
>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>> button event happens. This doesn't properly emulate real hardware on
>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>> the processor even when LINT1 is masked in LVT. For example, this
>> causes the problem that kdump initiated by NMI sometimes doesn't work
>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>
>> With this patch, we introduce introduce KVM_SET_LINT1,
>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>> without change the old KVM_NMI behavior.
>>
>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>> Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
>> ---
>>  arch/x86/include/asm/kvm.h |    1 +
>>  arch/x86/kvm/irq.h         |    1 +
>>  arch/x86/kvm/lapic.c       |    7 +++++++
>>  arch/x86/kvm/x86.c         |    8 ++++++++
>>  include/linux/kvm.h        |    5 +++++
>>  5 files changed, 22 insertions(+), 0 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
>> index 4d8dcbd..88d0ac3 100644
>> --- a/arch/x86/include/asm/kvm.h
>> +++ b/arch/x86/include/asm/kvm.h
>> @@ -24,6 +24,7 @@
>>  #define __KVM_HAVE_DEBUGREGS
>>  #define __KVM_HAVE_XSAVE
>>  #define __KVM_HAVE_XCRS
>> +#define __KVM_HAVE_SET_LINT1
>>  
>>  /* Architectural interrupt line count. */
>>  #define KVM_NR_INTERRUPTS 256
>> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
>> index 53e2d08..0c96315 100644
>> --- a/arch/x86/kvm/irq.h
>> +++ b/arch/x86/kvm/irq.h
>> @@ -95,6 +95,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
>>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
>>  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
>>  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
>>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
>>  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
>>  void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 57dcbd4..87fe36a 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -1039,6 +1039,13 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
>>  		kvm_apic_local_deliver(apic, APIC_LVT0);
>>  }
>>  
>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +
>> +	kvm_apic_local_deliver(apic, APIC_LVT1);
>> +}
>> +
>>  static struct kvm_timer_ops lapic_timer_ops = {
>>  	.is_periodic = lapic_is_periodic,
>>  };
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 84a28ea..fccd094 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>>  	case KVM_CAP_XSAVE:
>>  	case KVM_CAP_ASYNC_PF:
>>  	case KVM_CAP_GET_TSC_KHZ:
>> +	case KVM_CAP_SET_LINT1:
>>  		r = 1;
>>  		break;
>>  	case KVM_CAP_COALESCED_MMIO:
>> @@ -3264,6 +3265,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>>  
>>  		goto out;
>>  	}
>> +	case KVM_SET_LINT1: {
>> +		r = -EINVAL;
>> +		if (!irqchip_in_kernel(vcpu->kvm))
>> +			goto out;
>> +		r = 0;
>> +		kvm_apic_lint1_deliver(vcpu);
>> +	}
>>  	default:
>>  		r = -EINVAL;
>>  	}
>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>> index aace6b8..3a10572 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -554,6 +554,9 @@ struct kvm_ppc_pvinfo {
>>  #define KVM_CAP_PPC_SMT 64
>>  #define KVM_CAP_PPC_RMA	65
>>  #define KVM_CAP_S390_GMAP 71
>> +#ifdef __KVM_HAVE_SET_LINT1
>> +#define KVM_CAP_SET_LINT1 72
>> +#endif
> 
> Actually, there is no need for __KVM_HAVE_SET_LINT1 and #ifdef. User
> land will just do a runtime check.
> 
>

__KVM_HAVE_SET_LINT1 does no harm, and it helps with compile-time
checks.

Thanks,
Lai
Jan Kiszka Oct. 14, 2011, 9:32 a.m. UTC | #3
On 2011-10-14 11:27, Lai Jiangshan wrote:
> On 10/14/2011 05:07 PM, Jan Kiszka wrote:
>> On 2011-10-14 11:03, Lai Jiangshan wrote:
>>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>>> button event happens. This doesn't properly emulate real hardware on
>>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>>> the processor even when LINT1 is masked in LVT. For example, this
>>> causes the problem that kdump initiated by NMI sometimes doesn't work
>>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>>
>>> With this patch, we introduce introduce KVM_SET_LINT1,
>>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>>> without change the old KVM_NMI behavior.
>>>
>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> Reported-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
>>> ---
>>>  arch/x86/include/asm/kvm.h |    1 +
>>>  arch/x86/kvm/irq.h         |    1 +
>>>  arch/x86/kvm/lapic.c       |    7 +++++++
>>>  arch/x86/kvm/x86.c         |    8 ++++++++
>>>  include/linux/kvm.h        |    5 +++++
>>>  5 files changed, 22 insertions(+), 0 deletions(-)
>>> diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
>>> index 4d8dcbd..88d0ac3 100644
>>> --- a/arch/x86/include/asm/kvm.h
>>> +++ b/arch/x86/include/asm/kvm.h
>>> @@ -24,6 +24,7 @@
>>>  #define __KVM_HAVE_DEBUGREGS
>>>  #define __KVM_HAVE_XSAVE
>>>  #define __KVM_HAVE_XCRS
>>> +#define __KVM_HAVE_SET_LINT1
>>>  
>>>  /* Architectural interrupt line count. */
>>>  #define KVM_NR_INTERRUPTS 256
>>> diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
>>> index 53e2d08..0c96315 100644
>>> --- a/arch/x86/kvm/irq.h
>>> +++ b/arch/x86/kvm/irq.h
>>> @@ -95,6 +95,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
>>>  void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
>>>  void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
>>>  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
>>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
>>>  void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
>>>  void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
>>>  void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>>> index 57dcbd4..87fe36a 100644
>>> --- a/arch/x86/kvm/lapic.c
>>> +++ b/arch/x86/kvm/lapic.c
>>> @@ -1039,6 +1039,13 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
>>>  		kvm_apic_local_deliver(apic, APIC_LVT0);
>>>  }
>>>  
>>> +void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
>>> +{
>>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>>> +
>>> +	kvm_apic_local_deliver(apic, APIC_LVT1);
>>> +}
>>> +
>>>  static struct kvm_timer_ops lapic_timer_ops = {
>>>  	.is_periodic = lapic_is_periodic,
>>>  };
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> index 84a28ea..fccd094 100644
>>> --- a/arch/x86/kvm/x86.c
>>> +++ b/arch/x86/kvm/x86.c
>>> @@ -2077,6 +2077,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>>>  	case KVM_CAP_XSAVE:
>>>  	case KVM_CAP_ASYNC_PF:
>>>  	case KVM_CAP_GET_TSC_KHZ:
>>> +	case KVM_CAP_SET_LINT1:
>>>  		r = 1;
>>>  		break;
>>>  	case KVM_CAP_COALESCED_MMIO:
>>> @@ -3264,6 +3265,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>>>  
>>>  		goto out;
>>>  	}
>>> +	case KVM_SET_LINT1: {
>>> +		r = -EINVAL;
>>> +		if (!irqchip_in_kernel(vcpu->kvm))
>>> +			goto out;
>>> +		r = 0;
>>> +		kvm_apic_lint1_deliver(vcpu);
>>> +	}
>>>  	default:
>>>  		r = -EINVAL;
>>>  	}
>>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
>>> index aace6b8..3a10572 100644
>>> --- a/include/linux/kvm.h
>>> +++ b/include/linux/kvm.h
>>> @@ -554,6 +554,9 @@ struct kvm_ppc_pvinfo {
>>>  #define KVM_CAP_PPC_SMT 64
>>>  #define KVM_CAP_PPC_RMA	65
>>>  #define KVM_CAP_S390_GMAP 71
>>> +#ifdef __KVM_HAVE_SET_LINT1
>>> +#define KVM_CAP_SET_LINT1 72
>>> +#endif
>>
>> Actually, there is no need for __KVM_HAVE_SET_LINT1 and #ifdef. User
>> land will just do a runtime check.
>>
>>
> 
> There is not bad result brought by __KVM_HAVE_SET_LINT1
> and help for compile time check.

It's guarding an arch-specific CAP that will only be checked if there is
a need. That's in contrast to generic features that are not supported on
all archs (like __KVM_HAVE_GUEST_DEBUG -> KVM_CAP_SET_GUEST_DEBUG).
Granted, there are quite a few examples of redundant __KVM_HAVE/#ifdef
KVM_CAP in the KVM header, but let's not add more.

Jan
Avi Kivity Oct. 16, 2011, 9:39 a.m. UTC | #4
On 10/14/2011 11:03 AM, Lai Jiangshan wrote:
> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
> button event happens. This doesn't properly emulate real hardware on
> which NMI button event triggers LINT1. Because of this, NMI is sent to
> the processor even when LINT1 is masked in LVT. For example, this
> causes the problem that kdump initiated by NMI sometimes doesn't work
> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>
> With this patch, we introduce introduce KVM_SET_LINT1,
> and we can use KVM_SET_LINT1 to correctly emulate NMI button
> without change the old KVM_NMI behavior.
>  
> @@ -759,6 +762,8 @@ struct kvm_clock_data {
>  #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
>  /* Available with KVM_CAP_RMA */
>  #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
> +/* Available with KVM_CAP_SET_LINT1 for x86 */
> +#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
>  
>

LINT1 may have been programmed as a level-triggered interrupt instead
of edge triggered (NMI or interrupt).  We can use the ioctl argument for
the level (and pressing the NMI button needs to pulse the level to 1 and
back to 0).
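
For illustration only (this sketches the suggestion above, not the interface
in the posted patch), a button press would then be emulated by pulsing the
line level through the ioctl argument:

/* Hypothetical: the line level is passed as the ioctl argument. */
ioctl(vcpu_fd, KVM_SET_LINT1, 1);	/* assert LINT1 */
ioctl(vcpu_fd, KVM_SET_LINT1, 0);	/* deassert it again */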
Lai Jiangshan Oct. 17, 2011, 9:17 a.m. UTC | #5
On 10/16/2011 05:39 PM, Avi Kivity wrote:
> On 10/14/2011 11:03 AM, Lai Jiangshan wrote:
>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>> button event happens. This doesn't properly emulate real hardware on
>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>> the processor even when LINT1 is masked in LVT. For example, this
>> causes the problem that kdump initiated by NMI sometimes doesn't work
>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>
>> With this patch, we introduce introduce KVM_SET_LINT1,
>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>> without change the old KVM_NMI behavior.
>>  
>> @@ -759,6 +762,8 @@ struct kvm_clock_data {
>>  #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
>>  /* Available with KVM_CAP_RMA */
>>  #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>> +/* Available with KVM_CAP_SET_LINT1 for x86 */
>> +#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
>>  
>>
> 
> LINT1 may have been programmed as a level -triggered interrupt instead
> of edge triggered (NMI or interrupt).  We can use the ioctl argument for
> the level (and pressing the NMI button needs to pulse the level to 1 and
> back to 0).
> 

Hi, Avi,

How to handle level=0 in the kernel?
Or just ignore it?

Thanks,
Lai
Lai Jiangshan Oct. 17, 2011, 9:40 a.m. UTC | #6
On 10/16/2011 05:39 PM, Avi Kivity wrote:
> On 10/14/2011 11:03 AM, Lai Jiangshan wrote:
>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>> button event happens. This doesn't properly emulate real hardware on
>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>> the processor even when LINT1 is masked in LVT. For example, this
>> causes the problem that kdump initiated by NMI sometimes doesn't work
>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>
>> With this patch, we introduce introduce KVM_SET_LINT1,
>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>> without change the old KVM_NMI behavior.
>>  
>> @@ -759,6 +762,8 @@ struct kvm_clock_data {
>>  #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
>>  /* Available with KVM_CAP_RMA */
>>  #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>> +/* Available with KVM_CAP_SET_LINT1 for x86 */
>> +#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
>>  
>>
> 
> LINT1 may have been programmed as a level -triggered interrupt instead
> of edge triggered (NMI or interrupt).  We can use the ioctl argument for
> the level (and pressing the NMI button needs to pulse the level to 1 and
> back to 0).
> 

Hi, Avi, Jan,

Which approach do you prefer?
I need to know before spending too much time respinning one of
the approaches.

1) Fix the KVM_NMI emulation (the v3 patchset)
	- It directly fixes the problem and matches real hardware
	  more closely, but it changes the KVM_NMI behavior.
	- Requires both a kernel-side and a userspace-side fix.

2) Get the LAPIC state from the kernel irqchip and inject the NMI only
   if it is allowed (the v4 patchset; see the sketch after this list)
	- Simple, doesn't change any kernel behavior.
	- Only needs a userspace-side fix.

3) Add KVM_SET_LINT1 (the v5 patchset)
	- Doesn't change the kernel's KVM_NMI behavior.
	- More complex.
	- Requires both a kernel-side and a userspace-side fix.
	- Userspace must also handle the case where KVM_SET_LINT1 is not
	  available, which needs all of approach 2)'s code; i.e. this
	  approach equals approach 2) plus the KVM_SET_LINT1 ioctl.
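
For reference, a rough sketch of what the userspace side of approach 2)
could look like (the LVT1 register offset and mask bit follow the
architectural LAPIC layout; the helper name and error handling are made up):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define APIC_LVT1_OFFSET	0x360		/* LVT LINT1 register */
#define APIC_LVT_MASKED		(1u << 16)	/* LVT mask bit */

/* Inject an NMI only if this vCPU's LVT1 entry is not masked. */
static void inject_nmi_if_unmasked(int vcpu_fd)
{
	struct kvm_lapic_state lapic;
	uint32_t lvt1;

	if (ioctl(vcpu_fd, KVM_GET_LAPIC, &lapic) < 0)
		return;
	memcpy(&lvt1, &lapic.regs[APIC_LVT1_OFFSET], sizeof(lvt1));
	if (!(lvt1 & APIC_LVT_MASKED))
		ioctl(vcpu_fd, KVM_NMI);
}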

This is an urgent bug for us; we need to settle it soon.

Thanks,
Lai
Avi Kivity Oct. 17, 2011, 9:49 a.m. UTC | #7
On 10/17/2011 11:40 AM, Lai Jiangshan wrote:
> >>
> > 
> > LINT1 may have been programmed as a level -triggered interrupt instead
> > of edge triggered (NMI or interrupt).  We can use the ioctl argument for
> > the level (and pressing the NMI button needs to pulse the level to 1 and
> > back to 0).
> > 
>
> Hi, Avi, Jan,
>
> Which approach you prefer to?
> I need to know the result before wasting too much time to respin
> the approach.

Yes, sorry about the slow and sometimes conflicting feedback.

> 1) Fix KVM_NMI emulation approach  (which is v3 patchset)
> 	- It directly fixes the problem and matches the
> 	  real hard ware more, but it changes KVM_NMI bahavior.
> 	- Require both kernel-site and userspace-site fix.
>
> 2) Get the LAPIC state from kernel irqchip, and inject NMI if it is allowed
>    (which is v4 patchset)
> 	- Simple, don't changes any kernel behavior.
> 	- Only need the userspace-site fix
>
> 3) Add KVM_SET_LINT1 approach (which is v5 patchset)
> 	- don't changes the kernel's KVM_NMI behavior.
> 	- much complex
> 	- Require both kernel-site and userspace-site fix.
> 	- userspace-site should also handle the !KVM_SET_LINT1
> 	  condition, it uses all the 2) approach' code. it means
> 	  this approach equals the 2) approach + KVM_SET_LINT1 ioctl.
>
> This is an urgent bug of us, we need to settle it down soo

While (1) is simple, it overloads a single ioctl with two meanings,
which is not so good.

Whether we do (1) or (3), we need (2) as well, for older kernels.

So I recommend first focusing on (2) and merging it, then doing (3).

(Note that an additional issue with (3) is whether to make it a vm or vcpu
ioctl - we've been assuming a vcpu ioctl, but that's not necessarily the
best choice.)
Avi Kivity Oct. 17, 2011, 9:54 a.m. UTC | #8
On 10/17/2011 11:17 AM, Lai Jiangshan wrote:
> On 10/16/2011 05:39 PM, Avi Kivity wrote:
> > On 10/14/2011 11:03 AM, Lai Jiangshan wrote:
> >> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
> >> button event happens. This doesn't properly emulate real hardware on
> >> which NMI button event triggers LINT1. Because of this, NMI is sent to
> >> the processor even when LINT1 is masked in LVT. For example, this
> >> causes the problem that kdump initiated by NMI sometimes doesn't work
> >> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
> >>
> >> With this patch, we introduce introduce KVM_SET_LINT1,
> >> and we can use KVM_SET_LINT1 to correctly emulate NMI button
> >> without change the old KVM_NMI behavior.
> >>  
> >> @@ -759,6 +762,8 @@ struct kvm_clock_data {
> >>  #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
> >>  /* Available with KVM_CAP_RMA */
> >>  #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
> >> +/* Available with KVM_CAP_SET_LINT1 for x86 */
> >> +#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
> >>  
> >>
> > 
> > LINT1 may have been programmed as a level -triggered interrupt instead
> > of edge triggered (NMI or interrupt).  We can use the ioctl argument for
> > the level (and pressing the NMI button needs to pulse the level to 1 and
> > back to 0).
> > 
>
> Hi, Avi,
>
> How to handle level=0 in the kernel?
> Or just ignore it?

It needs to be handled according to the delivery mode, polarity, and
trigger mode bits in the LVT.

For example, a Fixed delivery mode with polarity 1 and level trigger
mode will post the interrupt as long as it is in level 0 and not masked
by the ISR.  __apic_accept_irq() should handle this.
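
As a rough sketch of that decision (the bit positions are the architectural
LVT layout; the helper and how its result would be fed into
__apic_accept_irq() are assumptions, not part of any posted patch):

#define LVT_MASKED		(1u << 16)
#define LVT_LEVEL_TRIGGER	(1u << 15)	/* 0 = edge, 1 = level */
#define LVT_ACTIVE_LOW		(1u << 13)	/* polarity */

/* Should a LINT1 line-level change lead to an injection right now? */
static int lint1_should_inject(unsigned int lvt1, int old_level, int new_level)
{
	int was = (lvt1 & LVT_ACTIVE_LOW) ? !old_level : old_level;
	int is  = (lvt1 & LVT_ACTIVE_LOW) ? !new_level : new_level;

	if (lvt1 & LVT_MASKED)
		return 0;
	if (lvt1 & LVT_LEVEL_TRIGGER)
		return is;		/* level: stays pending while asserted */
	return is && !was;		/* edge (e.g. NMI): only on the asserting transition */
}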
Jan Kiszka Oct. 17, 2011, 10:21 a.m. UTC | #9
On 2011-10-17 11:54, Avi Kivity wrote:
> On 10/17/2011 11:17 AM, Lai Jiangshan wrote:
>> On 10/16/2011 05:39 PM, Avi Kivity wrote:
>>> On 10/14/2011 11:03 AM, Lai Jiangshan wrote:
>>>> Currently, NMI interrupt is blindly sent to all the vCPUs when NMI
>>>> button event happens. This doesn't properly emulate real hardware on
>>>> which NMI button event triggers LINT1. Because of this, NMI is sent to
>>>> the processor even when LINT1 is masked in LVT. For example, this
>>>> causes the problem that kdump initiated by NMI sometimes doesn't work
>>>> on KVM, because kdump assumes NMI is masked on CPUs other than CPU0.
>>>>
>>>> With this patch, we introduce introduce KVM_SET_LINT1,
>>>> and we can use KVM_SET_LINT1 to correctly emulate NMI button
>>>> without change the old KVM_NMI behavior.
>>>>  
>>>> @@ -759,6 +762,8 @@ struct kvm_clock_data {
>>>>  #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
>>>>  /* Available with KVM_CAP_RMA */
>>>>  #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>>>> +/* Available with KVM_CAP_SET_LINT1 for x86 */
>>>> +#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
>>>>  
>>>>
>>>
>>> LINT1 may have been programmed as a level -triggered interrupt instead
>>> of edge triggered (NMI or interrupt).  We can use the ioctl argument for
>>> the level (and pressing the NMI button needs to pulse the level to 1 and
>>> back to 0).
>>>
>>
>> Hi, Avi,
>>
>> How to handle level=0 in the kernel?
>> Or just ignore it?
> 
> It needs to be handled according to the delivery mode, polarity, and
> trigger mode bits in the LVT.
> 
> For example, a Fixed delivery mode with polarity 1 and level trigger
> mode will post the interrupt as long as it is in level 0 and not masked
> by the ISR.  __apic_accept_irq() should handle this.

But I think it's not yet fully prepared for this (e.g., level is only
considered for APIC_DM_INIT).

Jan

Patch

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4d8dcbd..88d0ac3 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -24,6 +24,7 @@ 
 #define __KVM_HAVE_DEBUGREGS
 #define __KVM_HAVE_XSAVE
 #define __KVM_HAVE_XCRS
+#define __KVM_HAVE_SET_LINT1
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 53e2d08..0c96315 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -95,6 +95,7 @@  void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
+void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57dcbd4..87fe36a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1039,6 +1039,13 @@  void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
+void kvm_apic_lint1_deliver(struct kvm_vcpu *vcpu)
+{
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	kvm_apic_local_deliver(apic, APIC_LVT1);
+}
+
 static struct kvm_timer_ops lapic_timer_ops = {
 	.is_periodic = lapic_is_periodic,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84a28ea..fccd094 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2077,6 +2077,7 @@  int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
 	case KVM_CAP_GET_TSC_KHZ:
+	case KVM_CAP_SET_LINT1:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3264,6 +3265,13 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 
 		goto out;
 	}
+	case KVM_SET_LINT1: {
+		r = -EINVAL;
+		if (!irqchip_in_kernel(vcpu->kvm))
+			goto out;
+		r = 0;
+		kvm_apic_lint1_deliver(vcpu);
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index aace6b8..3a10572 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,9 @@  struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_S390_GMAP 71
+#ifdef __KVM_HAVE_SET_LINT1
+#define KVM_CAP_SET_LINT1 72
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -759,6 +762,8 @@  struct kvm_clock_data {
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
+/* Available with KVM_CAP_SET_LINT1 for x86 */
+#define KVM_SET_LINT1		  _IO(KVMIO,   0xaa)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)