diff mbox

rtc: introduce nmi disable bit handler for cmos

Message ID 1449926146-14828-1-git-send-email-arei.gonglei@huawei.com
State New
Headers show

Commit Message

Gonglei (Arei) Dec. 12, 2015, 1:15 p.m. UTC
The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of
Port 0x70 (and its aliases). This bit must be 0b to enable
the hardware chipset to send a Non-Maskable Interrupt. When
set to a 1b, NMI's are disabled. This bit is commonly accessed
by applications, BIOS, and even the operating system since it is
used to block NMI assertions when sensitive code is executing.

Currently, QEMU do no not handle the bit, means Qemu cannot
block NMI occur, sometimes maybe cause a race between the CMOS
read/write and the NMI handler. If you are setting the CMOS clock
or reading CMOS RAM and an NMI occurs, Bad values could be written
to or read from the CMOS RAM, or the NMI operation might not
occur correctly.

This patch introduce nmi disable bit handler to fix the problem
and make the emulated CMOS like the real hardware.

Signed-off-by: Gonglei <arei.gonglei@huawei.com>
---
 Please refer to:
   https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.html
 
 Note: We can't reproduce the problem, what a pity :(
 I holp the patch can fix it. Please review, thanks!
---
 hw/i386/kvm/apic.c                  |  4 +++-
 hw/timer/mc146818rtc.c              | 11 +++++++++++
 include/hw/timer/mc146818rtc_regs.h |  3 +++
 include/sysemu/sysemu.h             |  1 +
 target-i386/kvm.c                   |  4 ++--
 vl.c                                |  1 +
 6 files changed, 21 insertions(+), 3 deletions(-)

Comments

Paolo Bonzini Dec. 14, 2015, 9:56 a.m. UTC | #1
On 12/12/2015 14:15, Gonglei wrote:
> The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of
> Port 0x70 (and its aliases). This bit must be 0b to enable
> the hardware chipset to send a Non-Maskable Interrupt. When
> set to a 1b, NMI's are disabled. This bit is commonly accessed
> by applications, BIOS, and even the operating system since it is
> used to block NMI assertions when sensitive code is executing.
> 
> Currently, QEMU do no not handle the bit, means Qemu cannot
> block NMI occur, sometimes maybe cause a race between the CMOS
> read/write and the NMI handler. If you are setting the CMOS clock
> or reading CMOS RAM and an NMI occurs, Bad values could be written
> to or read from the CMOS RAM, or the NMI operation might not
> occur correctly.
> 
> This patch introduce nmi disable bit handler to fix the problem
> and make the emulated CMOS like the real hardware.

I think that this only works with -machine kernel_irqchip=off, however.
 You would have to add a new bit to struct kvm_vcpu_events, which could
for example replace nmi.pad.

>  Please refer to:
>    https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.html
>  
>  Note: We can't reproduce the problem, what a pity :(
>  I holp the patch can fix it. Please review, thanks!

The effect of the patch could be tested with kvm-unit-tests.

Thanks,

Paolo

>  hw/i386/kvm/apic.c                  |  4 +++-
>  hw/timer/mc146818rtc.c              | 11 +++++++++++
>  include/hw/timer/mc146818rtc_regs.h |  3 +++
>  include/sysemu/sysemu.h             |  1 +
>  target-i386/kvm.c                   |  4 ++--
>  vl.c                                |  1 +
>  6 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
> index 5b47056..deea49f 100644
> --- a/hw/i386/kvm/apic.c
> +++ b/hw/i386/kvm/apic.c
> @@ -12,6 +12,7 @@
>  #include "hw/i386/apic_internal.h"
>  #include "hw/pci/msi.h"
>  #include "sysemu/kvm.h"
> +#include "sysemu/sysemu.h"
>  
>  static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic,
>                                      int reg_id, uint32_t val)
> @@ -132,7 +133,8 @@ static void do_inject_external_nmi(void *data)
>      cpu_synchronize_state(cpu);
>  
>      lvt = s->lvt[APIC_LVT_LINT1];
> -    if (!(lvt & APIC_LVT_MASKED) && ((lvt >> 8) & 7) == APIC_DM_NMI) {
> +    if (!nmi_disabled && (lvt & APIC_LVT_MASKED)
> +        && ((lvt >> 8) & 7) == APIC_DM_NMI) {
>          ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
>          if (ret < 0) {
>              fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
> diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c
> index a9f0efd..aaa8b4e 100644
> --- a/hw/timer/mc146818rtc.c
> +++ b/hw/timer/mc146818rtc.c
> @@ -389,6 +389,17 @@ static void cmos_ioport_write(void *opaque, hwaddr addr,
>      RTCState *s = opaque;
>  
>      if ((addr & 1) == 0) {
> +        /*
> +         * The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of
> +         * Port 0x70 (and its aliases). This bit must be 0b to enable
> +         * the hardware chipset to send a Non-Maskable Interrupt. When
> +         * set to a 1b, NMI's are disabled. This bit is commonly accessed
> +         * by applications, BIOS, and even the operating system since it is
> +         * used to block NMI assertions when sensitive code is executing.
> +         */
> +        nmi_disabled = !!(data & NMI_DISABLE_BIT);
> +        CMOS_DPRINTF("cmos: nmi_disabled=%s\n",
> +                     nmi_disabled ? "true" : "false");
>          s->cmos_index = data & 0x7f;
>      } else {
>          CMOS_DPRINTF("cmos: write index=0x%02x val=0x%02" PRIx64 "\n",
> diff --git a/include/hw/timer/mc146818rtc_regs.h b/include/hw/timer/mc146818rtc_regs.h
> index ccdee42..175249f 100644
> --- a/include/hw/timer/mc146818rtc_regs.h
> +++ b/include/hw/timer/mc146818rtc_regs.h
> @@ -64,4 +64,7 @@
>  #define REG_C_AF   0x20
>  #define REG_C_MASK 0x70
>  
> +/* PORT_CMOS_INDEX nmi disable bit */
> +#define NMI_DISABLE_BIT 0x80
> +
>  #endif
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 3bb8897..a5b2342 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -177,6 +177,7 @@ extern uint8_t qemu_extra_params_fw[2];
>  extern QEMUClockType rtc_clock;
>  extern const char *mem_path;
>  extern int mem_prealloc;
> +extern bool nmi_disabled;
>  
>  #define MAX_NODES 128
>  #define NUMA_NODE_UNASSIGNED MAX_NODES
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 6dc9846..abbd65b 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -2456,9 +2456,9 @@ void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
>      CPUX86State *env = &x86_cpu->env;
>      int ret;
>  
> -    /* Inject NMI */
> +    /* Inject NMI Or SMI */
>      if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
> -        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
> +        if (!nmi_disabled && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
>              qemu_mutex_lock_iothread();
>              cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
>              qemu_mutex_unlock_iothread();
> diff --git a/vl.c b/vl.c
> index 4211ff1..ff5b06f 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -185,6 +185,7 @@ bool boot_strict;
>  uint8_t *boot_splash_filedata;
>  size_t boot_splash_filedata_size;
>  uint8_t qemu_extra_params_fw[2];
> +bool nmi_disabled;
>  
>  int icount_align_option;
>  
>
Gonglei (Arei) Dec. 14, 2015, 12:49 p.m. UTC | #2
Hi Paolo,

Thanks for your comments firstly.

> Subject: Re: [PATCH] rtc: introduce nmi disable bit handler for cmos
> 
> 
> 
> On 12/12/2015 14:15, Gonglei wrote:
> > The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of Port 0x70
> > (and its aliases). This bit must be 0b to enable the hardware chipset
> > to send a Non-Maskable Interrupt. When set to a 1b, NMI's are
> > disabled. This bit is commonly accessed by applications, BIOS, and
> > even the operating system since it is used to block NMI assertions
> > when sensitive code is executing.
> >
> > Currently, QEMU do no not handle the bit, means Qemu cannot block NMI
> > occur, sometimes maybe cause a race between the CMOS read/write and
> > the NMI handler. If you are setting the CMOS clock or reading CMOS RAM
> > and an NMI occurs, Bad values could be written to or read from the
> > CMOS RAM, or the NMI operation might not occur correctly.
> >
> > This patch introduce nmi disable bit handler to fix the problem and
> > make the emulated CMOS like the real hardware.
> 
> I think that this only works with -machine kernel_irqchip=off, however.

IIRCC, the kernel_irqchip is disabled by default, and we used the default value.

>  You would have to add a new bit to struct kvm_vcpu_events, which could for
> example replace nmi.pad.
> 
You mean we should keep the value of nmi_disabled when we want to live migration?
Or do you have other meaning? I'm not familiar with this, sorry about that. :(

> >  Please refer to:
> >    https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.html
> >
> >  Note: We can't reproduce the problem, what a pity :(  I holp the
> > patch can fix it. Please review, thanks!
> 
> The effect of the patch could be tested with kvm-unit-tests.
> 

I'll test this version with kvm-unit-tests.



Regards,
-Gonglei

> Thanks,
> 
> Paolo
> 
> >  hw/i386/kvm/apic.c                  |  4 +++-
> >  hw/timer/mc146818rtc.c              | 11 +++++++++++
> >  include/hw/timer/mc146818rtc_regs.h |  3 +++
> >  include/sysemu/sysemu.h             |  1 +
> >  target-i386/kvm.c                   |  4 ++--
> >  vl.c                                |  1 +
> >  6 files changed, 21 insertions(+), 3 deletions(-)
> >
> > diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index
> > 5b47056..deea49f 100644
> > --- a/hw/i386/kvm/apic.c
> > +++ b/hw/i386/kvm/apic.c
> > @@ -12,6 +12,7 @@
> >  #include "hw/i386/apic_internal.h"
> >  #include "hw/pci/msi.h"
> >  #include "sysemu/kvm.h"
> > +#include "sysemu/sysemu.h"
> >
> >  static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic,
> >                                      int reg_id, uint32_t val) @@
> > -132,7 +133,8 @@ static void do_inject_external_nmi(void *data)
> >      cpu_synchronize_state(cpu);
> >
> >      lvt = s->lvt[APIC_LVT_LINT1];
> > -    if (!(lvt & APIC_LVT_MASKED) && ((lvt >> 8) & 7) == APIC_DM_NMI) {
> > +    if (!nmi_disabled && (lvt & APIC_LVT_MASKED)
> > +        && ((lvt >> 8) & 7) == APIC_DM_NMI) {
> >          ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
> >          if (ret < 0) {
> >              fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
> > diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c index
> > a9f0efd..aaa8b4e 100644
> > --- a/hw/timer/mc146818rtc.c
> > +++ b/hw/timer/mc146818rtc.c
> > @@ -389,6 +389,17 @@ static void cmos_ioport_write(void *opaque,
> hwaddr addr,
> >      RTCState *s = opaque;
> >
> >      if ((addr & 1) == 0) {
> > +        /*
> > +         * The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of
> > +         * Port 0x70 (and its aliases). This bit must be 0b to enable
> > +         * the hardware chipset to send a Non-Maskable Interrupt. When
> > +         * set to a 1b, NMI's are disabled. This bit is commonly accessed
> > +         * by applications, BIOS, and even the operating system since it is
> > +         * used to block NMI assertions when sensitive code is executing.
> > +         */
> > +        nmi_disabled = !!(data & NMI_DISABLE_BIT);
> > +        CMOS_DPRINTF("cmos: nmi_disabled=%s\n",
> > +                     nmi_disabled ? "true" : "false");
> >          s->cmos_index = data & 0x7f;
> >      } else {
> >          CMOS_DPRINTF("cmos: write index=0x%02x val=0x%02" PRIx64
> > "\n", diff --git a/include/hw/timer/mc146818rtc_regs.h
> > b/include/hw/timer/mc146818rtc_regs.h
> > index ccdee42..175249f 100644
> > --- a/include/hw/timer/mc146818rtc_regs.h
> > +++ b/include/hw/timer/mc146818rtc_regs.h
> > @@ -64,4 +64,7 @@
> >  #define REG_C_AF   0x20
> >  #define REG_C_MASK 0x70
> >
> > +/* PORT_CMOS_INDEX nmi disable bit */ #define NMI_DISABLE_BIT 0x80
> > +
> >  #endif
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index
> > 3bb8897..a5b2342 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -177,6 +177,7 @@ extern uint8_t qemu_extra_params_fw[2];  extern
> > QEMUClockType rtc_clock;  extern const char *mem_path;  extern int
> > mem_prealloc;
> > +extern bool nmi_disabled;
> >
> >  #define MAX_NODES 128
> >  #define NUMA_NODE_UNASSIGNED MAX_NODES diff --git
> a/target-i386/kvm.c
> > b/target-i386/kvm.c index 6dc9846..abbd65b 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -2456,9 +2456,9 @@ void kvm_arch_pre_run(CPUState *cpu, struct
> kvm_run *run)
> >      CPUX86State *env = &x86_cpu->env;
> >      int ret;
> >
> > -    /* Inject NMI */
> > +    /* Inject NMI Or SMI */
> >      if (cpu->interrupt_request & (CPU_INTERRUPT_NMI |
> CPU_INTERRUPT_SMI)) {
> > -        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
> > +        if (!nmi_disabled && (cpu->interrupt_request &
> > + CPU_INTERRUPT_NMI)) {
> >              qemu_mutex_lock_iothread();
> >              cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
> >              qemu_mutex_unlock_iothread(); diff --git a/vl.c b/vl.c
> > index 4211ff1..ff5b06f 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -185,6 +185,7 @@ bool boot_strict;
> >  uint8_t *boot_splash_filedata;
> >  size_t boot_splash_filedata_size;
> >  uint8_t qemu_extra_params_fw[2];
> > +bool nmi_disabled;
> >
> >  int icount_align_option;
> >
> >
Paolo Bonzini Dec. 14, 2015, 12:51 p.m. UTC | #3
On 14/12/2015 13:49, Gonglei (Arei) wrote:
>>> > > This patch introduce nmi disable bit handler to fix the problem and
>>> > > make the emulated CMOS like the real hardware.
>> > 
>> > I think that this only works with -machine kernel_irqchip=off, however.
> IIRCC, the kernel_irqchip is disabled by default, and we used the default value.

No, it's enabled by default.

> >  You would have to add a new bit to struct kvm_vcpu_events, which could for
> > example replace nmi.pad.
> 
> You mean we should keep the value of nmi_disabled when we want to live migration?

Yes.  It can also be used to communicate the enabling/disabling of NMIs
when the RTC is written.

> > >  Please refer to:
> > >    https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.html
> > >
> > >  Note: We can't reproduce the problem, what a pity :(  I holp the
> > > patch can fix it. Please review, thanks!
> > 
> > The effect of the patch could be tested with kvm-unit-tests.
> 
> I'll test this version with kvm-unit-tests.

Note that you'll have to write a new test. :)

Paolo
Gonglei (Arei) Dec. 14, 2015, 1:27 p.m. UTC | #4
> 
> On 14/12/2015 13:49, Gonglei (Arei) wrote:
> >>> > > This patch introduce nmi disable bit handler to fix the problem
> >>> > > and make the emulated CMOS like the real hardware.
> >> >
> >> > I think that this only works with -machine kernel_irqchip=off, however.
> > IIRCC, the kernel_irqchip is disabled by default, and we used the default
> value.
> 
> No, it's enabled by default.
> 

Okay, yes, I saw the source code again. That means kmod finish the NMI injection
wrok, and the NMI will not pass Qemu side. So, you thought this patch cannot block
NMI injection when kernel_irqchip=on ?

Maybe we should pass the nmi_disable bit to Kmod when kernel_irqchip=on , right?

Regards,
-Gonglei

> > >  You would have to add a new bit to struct kvm_vcpu_events, which
> > > could for example replace nmi.pad.
> >
> > You mean we should keep the value of nmi_disabled when we want to live
> migration?
> 
> Yes.  It can also be used to communicate the enabling/disabling of NMIs when
> the RTC is written.
> 
> > > >  Please refer to:
> > > >
> > > > https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.htm
> > > > l
> > > >
> > > >  Note: We can't reproduce the problem, what a pity :(  I holp the
> > > > patch can fix it. Please review, thanks!
> > >
> > > The effect of the patch could be tested with kvm-unit-tests.
> >
> > I'll test this version with kvm-unit-tests.
> 
> Note that you'll have to write a new test. :)
> 
> Paolo
Paolo Bonzini Dec. 14, 2015, 1:37 p.m. UTC | #5
On 14/12/2015 14:27, Gonglei (Arei) wrote:
> 
>>
>> On 14/12/2015 13:49, Gonglei (Arei) wrote:
>>>>>>> This patch introduce nmi disable bit handler to fix the problem
>>>>>>> and make the emulated CMOS like the real hardware.
>>>>>
>>>>> I think that this only works with -machine kernel_irqchip=off, however.
>>> IIRCC, the kernel_irqchip is disabled by default, and we used the default
>> value.
>>
>> No, it's enabled by default.
>>
> 
> Okay, yes, I saw the source code again. That means kmod finish the NMI injection
> wrok, and the NMI will not pass Qemu side. So, you thought this patch cannot block
> NMI injection when kernel_irqchip=on ?

I am not sure.  It depends on which NMIs are blocked by the bit.  For
example, the IOAPIC can deliver NMIs, and they wouldn't be blocked.

Do you have any documentation, to see whether they can actually happen
on emulated hardware?  I guess we support the TCO watchdog, so yes.

> Maybe we should pass the nmi_disable bit to Kmod when kernel_irqchip=on , right?

Yes, that's the idea.

But first of all, I've read the thread you linked, and I couldn't find
the place where it says that the root cause is NMIs.

Paolo
Eduardo Habkost Dec. 14, 2015, 6:16 p.m. UTC | #6
On Sat, Dec 12, 2015 at 09:15:46PM +0800, Gonglei wrote:
> The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of
> Port 0x70 (and its aliases). This bit must be 0b to enable
> the hardware chipset to send a Non-Maskable Interrupt. When
> set to a 1b, NMI's are disabled. This bit is commonly accessed
> by applications, BIOS, and even the operating system since it is
> used to block NMI assertions when sensitive code is executing.
> 
> Currently, QEMU do no not handle the bit, means Qemu cannot
> block NMI occur, sometimes maybe cause a race between the CMOS
> read/write and the NMI handler. If you are setting the CMOS clock
> or reading CMOS RAM and an NMI occurs, Bad values could be written
> to or read from the CMOS RAM, or the NMI operation might not
> occur correctly.
> 
> This patch introduce nmi disable bit handler to fix the problem
> and make the emulated CMOS like the real hardware.
> 
> Signed-off-by: Gonglei <arei.gonglei@huawei.com>
> ---
>  Please refer to:
>    https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.html
>  
>  Note: We can't reproduce the problem, what a pity :(
>  I holp the patch can fix it. Please review, thanks!
> ---
>  hw/i386/kvm/apic.c                  |  4 +++-
>  hw/timer/mc146818rtc.c              | 11 +++++++++++
>  include/hw/timer/mc146818rtc_regs.h |  3 +++
>  include/sysemu/sysemu.h             |  1 +
>  target-i386/kvm.c                   |  4 ++--
>  vl.c                                |  1 +
>  6 files changed, 21 insertions(+), 3 deletions(-)
> 
[...]
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 3bb8897..a5b2342 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -177,6 +177,7 @@ extern uint8_t qemu_extra_params_fw[2];
>  extern QEMUClockType rtc_clock;
>  extern const char *mem_path;
>  extern int mem_prealloc;
> +extern bool nmi_disabled;

Please, not another global variable. Doesn't this belong to
struct RTCState or APICCommonState?
Gonglei (Arei) Dec. 15, 2015, 12:58 a.m. UTC | #7
> From: Paolo Bonzini [mailto:pbonzini@redhat.com]
> Sent: Monday, December 14, 2015 9:37 PM
> On 14/12/2015 14:27, Gonglei (Arei) wrote:
> >
> >>
> >> On 14/12/2015 13:49, Gonglei (Arei) wrote:
> >>>>>>> This patch introduce nmi disable bit handler to fix the problem
> >>>>>>> and make the emulated CMOS like the real hardware.
> >>>>>
> >>>>> I think that this only works with -machine kernel_irqchip=off, however.
> >>> IIRCC, the kernel_irqchip is disabled by default, and we used the
> >>> default
> >> value.
> >>
> >> No, it's enabled by default.
> >>
> >
> > Okay, yes, I saw the source code again. That means kmod finish the NMI
> > injection wrok, and the NMI will not pass Qemu side. So, you thought
> > this patch cannot block NMI injection when kernel_irqchip=on ?
> 
> I am not sure.  It depends on which NMIs are blocked by the bit.  For
> example, the IOAPIC can deliver NMIs, and they wouldn't be blocked.
> 
> Do you have any documentation, to see whether they can actually happen on
> emulated hardware?  I guess we support the TCO watchdog, so yes.
> 
Yes, watchdog is one case, and we have another case which need to use NMI to
tell guest do something when guest's cpu stuck or something like that.
And I can invoke qmp command "inject-nmi" when SeaBIOS try to close NMI by
invoking rtc_read() or rtc_write(). 

After the NMI injection, the guest will reboot:

[2015-12-14 16:41:57] In resume (status=0)
[2015-12-14 16:41:57] In 32bit resume
[2015-12-14 16:41:57] =====Attempting a hard reboot====
[2015-12-14 16:41:58] SeaBIOS (version rel-1.8.1-0-g4adadbd-20151214_135833-linux-jAPTBr)

[snip]

So, I think we should handle those scenarios, just like the real hardware.

> > Maybe we should pass the nmi_disable bit to Kmod when kernel_irqchip=on ,
> right?
> 
> Yes, that's the idea.
> 
That means I have much more work need to do.

> But first of all, I've read the thread you linked, and I couldn't find the place
> where it says that the root cause is NMIs.
> 
That's complete true. I haven't direct proof, but I think I eliminated
all possible causes, except NMIs. Of course, if you find any other clues,
please let me know.

The most trouble thing is I couldn't reproduce this problem. :(

Thanks,
-Gonglei
Gonglei (Arei) Dec. 15, 2015, 1 a.m. UTC | #8
> -----Original Message-----
> From: Eduardo Habkost [mailto:ehabkost@redhat.com]
> Sent: Tuesday, December 15, 2015 2:17 AM
> To: Gonglei (Arei)
> Cc: qemu-devel@nongnu.org; pbonzini@redhat.com; rth@twiddle.net;
> kevin@koconnor.net; Huangpeng (Peter)
> Subject: Re: [PATCH] rtc: introduce nmi disable bit handler for cmos
> 
> On Sat, Dec 12, 2015 at 09:15:46PM +0800, Gonglei wrote:
> > The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of Port 0x70
> > (and its aliases). This bit must be 0b to enable the hardware chipset
> > to send a Non-Maskable Interrupt. When set to a 1b, NMI's are
> > disabled. This bit is commonly accessed by applications, BIOS, and
> > even the operating system since it is used to block NMI assertions
> > when sensitive code is executing.
> >
> > Currently, QEMU do no not handle the bit, means Qemu cannot block NMI
> > occur, sometimes maybe cause a race between the CMOS read/write and
> > the NMI handler. If you are setting the CMOS clock or reading CMOS RAM
> > and an NMI occurs, Bad values could be written to or read from the
> > CMOS RAM, or the NMI operation might not occur correctly.
> >
> > This patch introduce nmi disable bit handler to fix the problem and
> > make the emulated CMOS like the real hardware.
> >
> > Signed-off-by: Gonglei <arei.gonglei@huawei.com>
> > ---
> >  Please refer to:
> >    https://lists.gnu.org/archive/html/qemu-devel/2015-11/msg00616.html
> >
> >  Note: We can't reproduce the problem, what a pity :(  I holp the
> > patch can fix it. Please review, thanks!
> > ---
> >  hw/i386/kvm/apic.c                  |  4 +++-
> >  hw/timer/mc146818rtc.c              | 11 +++++++++++
> >  include/hw/timer/mc146818rtc_regs.h |  3 +++
> >  include/sysemu/sysemu.h             |  1 +
> >  target-i386/kvm.c                   |  4 ++--
> >  vl.c                                |  1 +
> >  6 files changed, 21 insertions(+), 3 deletions(-)
> >
> [...]
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index
> > 3bb8897..a5b2342 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -177,6 +177,7 @@ extern uint8_t qemu_extra_params_fw[2];  extern
> > QEMUClockType rtc_clock;  extern const char *mem_path;  extern int
> > mem_prealloc;
> > +extern bool nmi_disabled;
> 
> Please, not another global variable. Doesn't this belong to struct RTCState or
> APICCommonState?
> 
OK, I'll think about changing this in the next version, thanks.


Regards,
-Gonglei
Gonglei (Arei) Dec. 15, 2015, 9:34 a.m. UTC | #9
Hi Paolo,

/* for KVM_GET/SET_VCPU_EVENTS */
struct kvm_vcpu_events {
 ...
struct {
		__u8 injected;
		__u8 pending;
		__u8 masked;
		__u8 pad;
	} nmi;
 ...

I found that the nmi.masked property does these enable or disable NMI jobs. 
So, I think we don't need to add a new bit. Right?

Regards,
-Gonglei


> -----Original Message-----
> From: Gonglei (Arei)
> Sent: Tuesday, December 15, 2015 8:58 AM
> To: 'Paolo Bonzini'; qemu-devel@nongnu.org
> Cc: rth@twiddle.net; ehabkost@redhat.com; kevin@koconnor.net; Huangpeng
> (Peter)
> Subject: RE: [PATCH] rtc: introduce nmi disable bit handler for cmos
> 
> > From: Paolo Bonzini [mailto:pbonzini@redhat.com]
> > Sent: Monday, December 14, 2015 9:37 PM
> > On 14/12/2015 14:27, Gonglei (Arei) wrote:
> > >
> > >>
> > >> On 14/12/2015 13:49, Gonglei (Arei) wrote:
> > >>>>>>> This patch introduce nmi disable bit handler to fix the problem
> > >>>>>>> and make the emulated CMOS like the real hardware.
> > >>>>>
> > >>>>> I think that this only works with -machine kernel_irqchip=off, however.
> > >>> IIRCC, the kernel_irqchip is disabled by default, and we used the
> > >>> default
> > >> value.
> > >>
> > >> No, it's enabled by default.
> > >>
> > >
> > > Okay, yes, I saw the source code again. That means kmod finish the NMI
> > > injection wrok, and the NMI will not pass Qemu side. So, you thought
> > > this patch cannot block NMI injection when kernel_irqchip=on ?
> >
> > I am not sure.  It depends on which NMIs are blocked by the bit.  For
> > example, the IOAPIC can deliver NMIs, and they wouldn't be blocked.
> >
> > Do you have any documentation, to see whether they can actually happen on
> > emulated hardware?  I guess we support the TCO watchdog, so yes.
> >
> Yes, watchdog is one case, and we have another case which need to use NMI
> to
> tell guest do something when guest's cpu stuck or something like that.
> And I can invoke qmp command "inject-nmi" when SeaBIOS try to close NMI by
> invoking rtc_read() or rtc_write().
> 
> After the NMI injection, the guest will reboot:
> 
> [2015-12-14 16:41:57] In resume (status=0)
> [2015-12-14 16:41:57] In 32bit resume
> [2015-12-14 16:41:57] =====Attempting a hard reboot====
> [2015-12-14 16:41:58] SeaBIOS (version
> rel-1.8.1-0-g4adadbd-20151214_135833-linux-jAPTBr)
> 
> [snip]
> 
> So, I think we should handle those scenarios, just like the real hardware.
> 
> > > Maybe we should pass the nmi_disable bit to Kmod when
> kernel_irqchip=on ,
> > right?
> >
> > Yes, that's the idea.
> >
> That means I have much more work need to do.
> 
> > But first of all, I've read the thread you linked, and I couldn't find the place
> > where it says that the root cause is NMIs.
> >
> That's complete true. I haven't direct proof, but I think I eliminated
> all possible causes, except NMIs. Of course, if you find any other clues,
> please let me know.
> 
> The most trouble thing is I couldn't reproduce this problem. :(
> 
> Thanks,
> -Gonglei
Paolo Bonzini Dec. 15, 2015, 10:43 a.m. UTC | #10
> Hi Paolo,
> 
> /* for KVM_GET/SET_VCPU_EVENTS */
> struct kvm_vcpu_events {
>  ...
> struct {
> 		__u8 injected;
> 		__u8 pending;
> 		__u8 masked;
> 		__u8 pad;
> 	} nmi;
>  ...
> 
> I found that the nmi.masked property does these enable or disable NMI jobs.
> So, I think we don't need to add a new bit. Right?

nmi.masked says whether the CPU is accepting the NMIs, and is cleared
by the next IRET instruction.  This is a different thing; it probably
shouldn't affect NMI IPIs, and it definitely should remain set until
cleared via the RTC.  So it should be something like

    _u8 external_nmi_disabled;

or similar.

*However* I found this in the ICH9 datasheet:

    The ICH9's I/O APIC can only send interrupts due to interrupts which
    do not include SMI, NMI or INIT. This means that in IA-32/Intel ® 64
    based platforms, Front Side Bus interrupt message format delivery modes
    010 (SMI/PMI), 100 (NMI), and 101 (INIT) as indicated in this section,
    must not be used and is not supported.

In theory the PIIX4 could deliver such messages, but perhaps we could
disable them in the KVM IOAPIC.  If we do this, there is no need for a
change to struct kvm_vcpu_events, because all external NMI sources will
be in userspace.

Radim, what do you think?

Paolo
Radim Krčmář Dec. 15, 2015, 6:53 p.m. UTC | #11
2015-12-15 05:43-0500, Paolo Bonzini:
>> Hi Paolo,
>> 
>> /* for KVM_GET/SET_VCPU_EVENTS */
>> struct kvm_vcpu_events {
>>  ...
>> struct {
>> 		__u8 injected;
>> 		__u8 pending;
>> 		__u8 masked;
>> 		__u8 pad;
>> 	} nmi;
>>  ...
>> 
>> I found that the nmi.masked property does these enable or disable NMI jobs.
>> So, I think we don't need to add a new bit. Right?
> 
> nmi.masked says whether the CPU is accepting the NMIs, and is cleared
> by the next IRET instruction.  This is a different thing; it probably
> shouldn't affect NMI IPIs, and it definitely should remain set until
> cleared via the RTC.  So it should be something like
> 
>     _u8 external_nmi_disabled;
> 
> or similar.
> 
> *However* I found this in the ICH9 datasheet:
> 
>     The ICH9's I/O APIC can only send interrupts due to interrupts which
>     do not include SMI, NMI or INIT. This means that in IA-32/Intel ® 64
>     based platforms, Front Side Bus interrupt message format delivery modes
>     010 (SMI/PMI), 100 (NMI), and 101 (INIT) as indicated in this section,
>     must not be used and is not supported.
> 
> In theory the PIIX4 could deliver such messages, but perhaps we could
> disable them in the KVM IOAPIC.  If we do this, there is no need for a
> change to struct kvm_vcpu_events, because all external NMI sources will
> be in userspace.
> 
> Radim, what do you think?

I looked at the 440fx, piix, and 82083aa(ioapic) datasheets and the
NMI_EN bit doesn't seem to be propagated into the IOAPIC.
The IOAPIC datasheet doesn't mention a thing about NMI masking and PIIX4
generates NMI on SERR# or IOCHK# so it seems that the NMI_EN feature
only changes the behavior of those two ...

I think it's best to do nothing in KVM.

(q35 guests shouldn't configure IOAPIC to send unsupported messages and
 disabling SMI/NMI/INIT in the in-kernel IOAPIC for piix is risky.)
Gonglei (Arei) Dec. 16, 2015, 8:26 a.m. UTC | #12
Hi,

> -----Original Message-----

> From: Radim Krcmar [mailto:rkrcmar@redhat.com]

> Subject: Re: [PATCH] rtc: introduce nmi disable bit handler for cmos

> 

> 2015-12-15 05:43-0500, Paolo Bonzini:

> >> Hi Paolo,

> >>

> >> /* for KVM_GET/SET_VCPU_EVENTS */

> >> struct kvm_vcpu_events {

> >>  ...

> >> struct {

> >> 		__u8 injected;

> >> 		__u8 pending;

> >> 		__u8 masked;

> >> 		__u8 pad;

> >> 	} nmi;

> >>  ...

> >>

> >> I found that the nmi.masked property does these enable or disable NMI

> jobs.

> >> So, I think we don't need to add a new bit. Right?

> >

> > nmi.masked says whether the CPU is accepting the NMIs, and is cleared

> > by the next IRET instruction.  This is a different thing; it probably

> > shouldn't affect NMI IPIs, and it definitely should remain set until

> > cleared via the RTC.  So it should be something like

> >

> >     _u8 external_nmi_disabled;

> >

> > or similar.

> >

OK, I see, thanks.

> > *However* I found this in the ICH9 datasheet:

> >

> >     The ICH9's I/O APIC can only send interrupts due to interrupts which

> >     do not include SMI, NMI or INIT. This means that in IA-32/Intel ® 64

> >     based platforms, Front Side Bus interrupt message format delivery

> modes

> >     010 (SMI/PMI), 100 (NMI), and 101 (INIT) as indicated in this section,

> >     must not be used and is not supported.

> >

> > In theory the PIIX4 could deliver such messages, but perhaps we could

> > disable them in the KVM IOAPIC.  If we do this, there is no need for a

> > change to struct kvm_vcpu_events, because all external NMI sources will

> > be in userspace.

> >

> > Radim, what do you think?

> 

> I looked at the 440fx, piix, and 82083aa(ioapic) datasheets and the

> NMI_EN bit doesn't seem to be propagated into the IOAPIC.

> The IOAPIC datasheet doesn't mention a thing about NMI masking and PIIX4

> generates NMI on SERR# or IOCHK# so it seems that the NMI_EN feature

> only changes the behavior of those two ...

> 

Currently, Qemu doesn't handle SERR# (bit 7) and IOCHK# (bit 6) of port 0x61. The
Linux kernel gets the nmi reason when invoke default_do_nmi(), Qemu always
return 0.

Regards,
-Gonglei

> I think it's best to do nothing in KVM.

> 

> (q35 guests shouldn't configure IOAPIC to send unsupported messages and

>  disabling SMI/NMI/INIT in the in-kernel IOAPIC for piix is risky.)
Paolo Bonzini Dec. 16, 2015, 8:51 a.m. UTC | #13
On 15/12/2015 19:53, Radim Krcmar wrote:
> 2015-12-15 05:43-0500, Paolo Bonzini:
>>> Hi Paolo,
>>>
>>> /* for KVM_GET/SET_VCPU_EVENTS */
>>> struct kvm_vcpu_events {
>>>  ...
>>> struct {
>>> 		__u8 injected;
>>> 		__u8 pending;
>>> 		__u8 masked;
>>> 		__u8 pad;
>>> 	} nmi;
>>>  ...
>>>
>>> I found that the nmi.masked property does these enable or disable NMI jobs.
>>> So, I think we don't need to add a new bit. Right?
>>
>> nmi.masked says whether the CPU is accepting the NMIs, and is cleared
>> by the next IRET instruction.  This is a different thing; it probably
>> shouldn't affect NMI IPIs, and it definitely should remain set until
>> cleared via the RTC.  So it should be something like
>>
>>     _u8 external_nmi_disabled;
>>
>> or similar.
>>
>> *However* I found this in the ICH9 datasheet:
>>
>>     The ICH9's I/O APIC can only send interrupts due to interrupts which
>>     do not include SMI, NMI or INIT. This means that in IA-32/Intel ® 64
>>     based platforms, Front Side Bus interrupt message format delivery modes
>>     010 (SMI/PMI), 100 (NMI), and 101 (INIT) as indicated in this section,
>>     must not be used and is not supported.
>>
>> In theory the PIIX4 could deliver such messages, but perhaps we could
>> disable them in the KVM IOAPIC.  If we do this, there is no need for a
>> change to struct kvm_vcpu_events, because all external NMI sources will
>> be in userspace.
>>
>> Radim, what do you think?
> 
> I looked at the 440fx, piix, and 82083aa(ioapic) datasheets and the
> NMI_EN bit doesn't seem to be propagated into the IOAPIC.
> The IOAPIC datasheet doesn't mention a thing about NMI masking and PIIX4
> generates NMI on SERR# or IOCHK# so it seems that the NMI_EN feature
> only changes the behavior of those two ...
> 
> I think it's best to do nothing in KVM.

Then Gonglei's patch (apart from the issues that Eduardo pointed out) is
fine.

Paolo
diff mbox

Patch

diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
index 5b47056..deea49f 100644
--- a/hw/i386/kvm/apic.c
+++ b/hw/i386/kvm/apic.c
@@ -12,6 +12,7 @@ 
 #include "hw/i386/apic_internal.h"
 #include "hw/pci/msi.h"
 #include "sysemu/kvm.h"
+#include "sysemu/sysemu.h"
 
 static inline void kvm_apic_set_reg(struct kvm_lapic_state *kapic,
                                     int reg_id, uint32_t val)
@@ -132,7 +133,8 @@  static void do_inject_external_nmi(void *data)
     cpu_synchronize_state(cpu);
 
     lvt = s->lvt[APIC_LVT_LINT1];
-    if (!(lvt & APIC_LVT_MASKED) && ((lvt >> 8) & 7) == APIC_DM_NMI) {
+    if (!nmi_disabled && (lvt & APIC_LVT_MASKED)
+        && ((lvt >> 8) & 7) == APIC_DM_NMI) {
         ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
         if (ret < 0) {
             fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c
index a9f0efd..aaa8b4e 100644
--- a/hw/timer/mc146818rtc.c
+++ b/hw/timer/mc146818rtc.c
@@ -389,6 +389,17 @@  static void cmos_ioport_write(void *opaque, hwaddr addr,
     RTCState *s = opaque;
 
     if ((addr & 1) == 0) {
+        /*
+         * The Non-Maskable Interrupt (NMI) Enable bit is 0x80 bit of
+         * Port 0x70 (and its aliases). This bit must be 0b to enable
+         * the hardware chipset to send a Non-Maskable Interrupt. When
+         * set to a 1b, NMI's are disabled. This bit is commonly accessed
+         * by applications, BIOS, and even the operating system since it is
+         * used to block NMI assertions when sensitive code is executing.
+         */
+        nmi_disabled = !!(data & NMI_DISABLE_BIT);
+        CMOS_DPRINTF("cmos: nmi_disabled=%s\n",
+                     nmi_disabled ? "true" : "false");
         s->cmos_index = data & 0x7f;
     } else {
         CMOS_DPRINTF("cmos: write index=0x%02x val=0x%02" PRIx64 "\n",
diff --git a/include/hw/timer/mc146818rtc_regs.h b/include/hw/timer/mc146818rtc_regs.h
index ccdee42..175249f 100644
--- a/include/hw/timer/mc146818rtc_regs.h
+++ b/include/hw/timer/mc146818rtc_regs.h
@@ -64,4 +64,7 @@ 
 #define REG_C_AF   0x20
 #define REG_C_MASK 0x70
 
+/* PORT_CMOS_INDEX nmi disable bit */
+#define NMI_DISABLE_BIT 0x80
+
 #endif
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 3bb8897..a5b2342 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -177,6 +177,7 @@  extern uint8_t qemu_extra_params_fw[2];
 extern QEMUClockType rtc_clock;
 extern const char *mem_path;
 extern int mem_prealloc;
+extern bool nmi_disabled;
 
 #define MAX_NODES 128
 #define NUMA_NODE_UNASSIGNED MAX_NODES
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6dc9846..abbd65b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -2456,9 +2456,9 @@  void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
     CPUX86State *env = &x86_cpu->env;
     int ret;
 
-    /* Inject NMI */
+    /* Inject NMI Or SMI */
     if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
-        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
+        if (!nmi_disabled && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
             qemu_mutex_lock_iothread();
             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
             qemu_mutex_unlock_iothread();
diff --git a/vl.c b/vl.c
index 4211ff1..ff5b06f 100644
--- a/vl.c
+++ b/vl.c
@@ -185,6 +185,7 @@  bool boot_strict;
 uint8_t *boot_splash_filedata;
 size_t boot_splash_filedata_size;
 uint8_t qemu_extra_params_fw[2];
+bool nmi_disabled;
 
 int icount_align_option;