Patchwork [08/13] kvm: x86: Inject pending MCE events on state writeback

login
register
mail settings
Submitter Jan Kiszka
Date Feb. 15, 2011, 8:23 a.m.
Message ID <56bf7460c65f7fc5c89fa9673b880a556b99e0fc.1297758211.git.jan.kiszka@siemens.com>
Download mbox | patch
Permalink /patch/83210/
State New
Headers show

Comments

Jan Kiszka - Feb. 15, 2011, 8:23 a.m.
The current way of injecting MCE events without updating of and
synchronizing with the CPUState is broken and causes spurious
corruptions of the MCE-related parts of the CPUState.

As a first step towards a fix, enhance the state writeback code with
support for injecting events that are pending in the CPUState. A pending
exception will then be signaled via cpu_interrupt(CPU_INTERRUPT_MCE).
And, just like for TCG, we need to leave the halt state when
CPU_INTERRUPT_MCE is pending (left broken for the to-be-removed old KVM
code).

This will also allow unifying the TCG and KVM injection code.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
CC: Huang Ying <ying.huang@intel.com>
CC: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
CC: Jin Dongming <jin.dongming@np.css.fujitsu.com>
---
 target-i386/kvm.c |   75 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 70 insertions(+), 5 deletions(-)
Marcelo Tosatti - Feb. 17, 2011, 4:35 p.m.
On Tue, Feb 15, 2011 at 09:23:32AM +0100, Jan Kiszka wrote:
> The current way of injecting MCE events without updating of and
> synchronizing with the CPUState is broken and causes spurious
> corruptions of the MCE-related parts of the CPUState.

Can you explain how? The current problem with MCE is that it bypasses
the writeback code, but corruption has nothing to do with that.

> As a first step towards a fix, enhance the state writeback code with
> support for injecting events that are pending in the CPUState. A pending
> exception will then be signaled via cpu_interrupt(CPU_INTERRUPT_MCE).
> And, just like for TCG, we need to leave the halt state when
> CPU_INTERRUPT_MCE is pending (left broken for the to-be-removed old KVM
> code).
> 
> This will also allow to unify TCG and KVM injection code.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> CC: Huang Ying <ying.huang@intel.com>
> CC: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> CC: Jin Dongming <jin.dongming@np.css.fujitsu.com>
> ---
>  target-i386/kvm.c |   75 +++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 files changed, 70 insertions(+), 5 deletions(-)
> 
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index f909661..46f45db 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -467,6 +467,44 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
>  #endif /* !KVM_CAP_MCE*/
>  }
>  
> +static int kvm_inject_mce_oldstyle(CPUState *env)
> +{
> +#ifdef KVM_CAP_MCE
> +    if (kvm_has_vcpu_events()) {
> +        return 0;
> +    }
> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
> +        unsigned int bank, bank_num = env->mcg_cap & 0xff;
> +        struct kvm_x86_mce mce;
> +
> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
> +        assert(env->mcg_cap);
> +
> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
> +
> +        /*
> +         * There must be at least one bank in use if CPU_INTERRUPT_MCE was set.
> +         * Find it and use its values for the event injection.
> +         */
> +        for (bank = 0; bank < bank_num; bank++) {
> +            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
> +                break;
> +            }
> +        }
> +        assert(bank < bank_num);
> +
> +        mce.bank = bank;
> +        mce.status = env->mce_banks[bank * 4 + 1];
> +        mce.mcg_status = env->mcg_status;
> +        mce.addr = env->mce_banks[bank * 4 + 2];
> +        mce.misc = env->mce_banks[bank * 4 + 3];
> +
> +        return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, &mce);
> +    }
> +#endif /* KVM_CAP_MCE */
> +    return 0;
> +}
> +
>  static void cpu_update_state(void *opaque, int running, int reason)
>  {
>      CPUState *env = opaque;
> @@ -1375,10 +1413,25 @@ static int kvm_put_vcpu_events(CPUState *env, int level)
>          return 0;
>      }
>  
> -    events.exception.injected = (env->exception_injected >= 0);
> -    events.exception.nr = env->exception_injected;
> -    events.exception.has_error_code = env->has_error_code;
> -    events.exception.error_code = env->error_code;
> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
> +        assert(env->mcg_cap);
> +
> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
> +        if (env->exception_injected == EXCP08_DBLE) {
> +            /* this means triple fault */
> +            qemu_system_reset_request();
> +            env->exit_request = 1;
> +        }
> +        events.exception.injected = 1;
> +        events.exception.nr = EXCP12_MCHK;
> +        events.exception.has_error_code = 0;
> +    } else {
> +        events.exception.injected = (env->exception_injected >= 0);
> +        events.exception.nr = env->exception_injected;
> +        events.exception.has_error_code = env->has_error_code;
> +        events.exception.error_code = env->error_code;
> +    }

IMO it is important to maintain a scope for kvm_put_vcpu_events /
kvm_get_vcpu_events: they synchronize state to/from the kernel. Not more
than that. Whatever you're trying to do here should be higher in the
vcpu loop code.

>      events.interrupt.injected = (env->interrupt_injected >= 0);
>      events.interrupt.nr = env->interrupt_injected;
> @@ -1539,6 +1592,11 @@ int kvm_arch_put_registers(CPUState *env, int level)
>      if (ret < 0) {
>          return ret;
>      }
> +    /* must be before kvm_put_msrs */
> +    ret = kvm_inject_mce_oldstyle(env);
> +    if (ret < 0) {
> +        return ret;
> +    }
>      ret = kvm_put_msrs(env, level);
>      if (ret < 0) {
>          return ret;
> @@ -1678,10 +1736,17 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
>  int kvm_arch_process_irqchip_events(CPUState *env)
>  {
>      if (kvm_irqchip_in_kernel()) {
> +        if (env->interrupt_request & CPU_INTERRUPT_MCE) {
> +            kvm_cpu_synchronize_state(env);
> +            if (env->mp_state == KVM_MP_STATE_HALTED) {
> +                env->mp_state = KVM_MP_STATE_RUNNABLE;
> +            }
> +        }

Should not manipulate mp_state of a running vcpu (should only do that
for migration when the vcpu is stopped), since it's managed by the kernel
in the irqchip case.

>          return 0;
>      }
>  
> -    if (env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI)) {
> +    if (env->interrupt_request &
> +        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI | CPU_INTERRUPT_MCE)) {
>          env->halted = 0;
>      }
>      if (env->interrupt_request & CPU_INTERRUPT_INIT) {
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka - Feb. 17, 2011, 5:06 p.m.
On 2011-02-17 17:35, Marcelo Tosatti wrote:
> On Tue, Feb 15, 2011 at 09:23:32AM +0100, Jan Kiszka wrote:
>> The current way of injecting MCE events without updating of and
>> synchronizing with the CPUState is broken and causes spurious
>> corruptions of the MCE-related parts of the CPUState.
> 
> Can you explain how? The current pronlem with MCE is that it bypasses 
> writeback code, but corruption has nothing to do with that.

It's precisely the same scenario as with the old debug exception
re-injection: If we update the pending exception state via
KVM_SET_VCPU_EVENTS, we must not inject it via any other path. Otherwise
we end up with overwritten/lost events - which is extremely critical for
these rarely taken code paths.

Just like parts of KVM_SET_GUEST_DEBUG, KVM_X86_SET_MCE pre-dates
KVM_SET_VCPU_EVENTS, which obsoleted all other exception injection
mechanisms.

> 
>> As a first step towards a fix, enhance the state writeback code with
>> support for injecting events that are pending in the CPUState. A pending
>> exception will then be signaled via cpu_interrupt(CPU_INTERRUPT_MCE).
>> And, just like for TCG, we need to leave the halt state when
>> CPU_INTERRUPT_MCE is pending (left broken for the to-be-removed old KVM
>> code).
>>
>> This will also allow to unify TCG and KVM injection code.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> CC: Huang Ying <ying.huang@intel.com>
>> CC: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
>> CC: Jin Dongming <jin.dongming@np.css.fujitsu.com>
>> ---
>>  target-i386/kvm.c |   75 +++++++++++++++++++++++++++++++++++++++++++++++++---
>>  1 files changed, 70 insertions(+), 5 deletions(-)
>>
>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>> index f909661..46f45db 100644
>> --- a/target-i386/kvm.c
>> +++ b/target-i386/kvm.c
>> @@ -467,6 +467,44 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
>>  #endif /* !KVM_CAP_MCE*/
>>  }
>>  
>> +static int kvm_inject_mce_oldstyle(CPUState *env)
>> +{
>> +#ifdef KVM_CAP_MCE
>> +    if (kvm_has_vcpu_events()) {
>> +        return 0;
>> +    }
>> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
>> +        unsigned int bank, bank_num = env->mcg_cap & 0xff;
>> +        struct kvm_x86_mce mce;
>> +
>> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
>> +        assert(env->mcg_cap);
>> +
>> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
>> +
>> +        /*
>> +         * There must be at least one bank in use if CPU_INTERRUPT_MCE was set.
>> +         * Find it and use its values for the event injection.
>> +         */
>> +        for (bank = 0; bank < bank_num; bank++) {
>> +            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
>> +                break;
>> +            }
>> +        }
>> +        assert(bank < bank_num);
>> +
>> +        mce.bank = bank;
>> +        mce.status = env->mce_banks[bank * 4 + 1];
>> +        mce.mcg_status = env->mcg_status;
>> +        mce.addr = env->mce_banks[bank * 4 + 2];
>> +        mce.misc = env->mce_banks[bank * 4 + 3];
>> +
>> +        return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, &mce);
>> +    }
>> +#endif /* KVM_CAP_MCE */
>> +    return 0;
>> +}
>> +
>>  static void cpu_update_state(void *opaque, int running, int reason)
>>  {
>>      CPUState *env = opaque;
>> @@ -1375,10 +1413,25 @@ static int kvm_put_vcpu_events(CPUState *env, int level)
>>          return 0;
>>      }
>>  
>> -    events.exception.injected = (env->exception_injected >= 0);
>> -    events.exception.nr = env->exception_injected;
>> -    events.exception.has_error_code = env->has_error_code;
>> -    events.exception.error_code = env->error_code;
>> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
>> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
>> +        assert(env->mcg_cap);
>> +
>> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
>> +        if (env->exception_injected == EXCP08_DBLE) {
>> +            /* this means triple fault */
>> +            qemu_system_reset_request();
>> +            env->exit_request = 1;
>> +        }
>> +        events.exception.injected = 1;
>> +        events.exception.nr = EXCP12_MCHK;
>> +        events.exception.has_error_code = 0;
>> +    } else {
>> +        events.exception.injected = (env->exception_injected >= 0);
>> +        events.exception.nr = env->exception_injected;
>> +        events.exception.has_error_code = env->has_error_code;
>> +        events.exception.error_code = env->error_code;
>> +    }
> 
> IMO it is important to maintain a scope for kvm_put_vcpu_events /
> kvm_get_vcpu_events: they synchronize state to/from the kernel. Not more
> than that. Whatever you're trying to do here should be higher in the
> vcpu loop code.

We pick up CPU_INTERRUPT_MCE and translate it into the right exception
that put_vcpu_events is about to sync to the kernel. Which of those steps
should be done earlier? Calculating env->exception_injected?

> 
>>      events.interrupt.injected = (env->interrupt_injected >= 0);
>>      events.interrupt.nr = env->interrupt_injected;
>> @@ -1539,6 +1592,11 @@ int kvm_arch_put_registers(CPUState *env, int level)
>>      if (ret < 0) {
>>          return ret;
>>      }
>> +    /* must be before kvm_put_msrs */
>> +    ret = kvm_inject_mce_oldstyle(env);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>>      ret = kvm_put_msrs(env, level);
>>      if (ret < 0) {
>>          return ret;
>> @@ -1678,10 +1736,17 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
>>  int kvm_arch_process_irqchip_events(CPUState *env)
>>  {
>>      if (kvm_irqchip_in_kernel()) {
>> +        if (env->interrupt_request & CPU_INTERRUPT_MCE) {
>> +            kvm_cpu_synchronize_state(env);
>> +            if (env->mp_state == KVM_MP_STATE_HALTED) {
>> +                env->mp_state = KVM_MP_STATE_RUNNABLE;
>> +            }
>> +        }
> 
> Should not manipulate mp_state of a running vcpu (should only do that
> for migration when vcpu is stopped), since its managed by the kernel,
> for irqchip case.

Not for asynchronously injected MCEs. The target CPU would simply
oversleep them. MCEs are not in the scope of the in-kernel irqchip.

Jan
Marcelo Tosatti - Feb. 17, 2011, 5:55 p.m.
On Thu, Feb 17, 2011 at 06:06:19PM +0100, Jan Kiszka wrote:
> On 2011-02-17 17:35, Marcelo Tosatti wrote:
> > On Tue, Feb 15, 2011 at 09:23:32AM +0100, Jan Kiszka wrote:
> >> The current way of injecting MCE events without updating of and
> >> synchronizing with the CPUState is broken and causes spurious
> >> corruptions of the MCE-related parts of the CPUState.
> > 
> > Can you explain how? The current pronlem with MCE is that it bypasses 
> > writeback code, but corruption has nothing to do with that.
> 
> It's precisely the same scenario as with the old debug exception
> re-injection: If we update the pending exception state via
> KVM_SET_VCPU_EVENTS, we must not inject it via any other path. Otherwise
> we end up with overwritten/lost events - which is extremely critical for
> this rarely taken code paths.
> 
> Jut like parts of KVM_SET_GUEST_DEBUG, KVM_X86_SET_MCE pre-dates
> KVM_SET_VCPU_EVENTS which obsoleted all other exception injection
> mechanisms.

OK.

> > 
> >> As a first step towards a fix, enhance the state writeback code with
> >> support for injecting events that are pending in the CPUState. A pending
> >> exception will then be signaled via cpu_interrupt(CPU_INTERRUPT_MCE).
> >> And, just like for TCG, we need to leave the halt state when
> >> CPU_INTERRUPT_MCE is pending (left broken for the to-be-removed old KVM
> >> code).
> >>
> >> This will also allow to unify TCG and KVM injection code.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> >> CC: Huang Ying <ying.huang@intel.com>
> >> CC: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
> >> CC: Jin Dongming <jin.dongming@np.css.fujitsu.com>
> >> ---
> >>  target-i386/kvm.c |   75 +++++++++++++++++++++++++++++++++++++++++++++++++---
> >>  1 files changed, 70 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> >> index f909661..46f45db 100644
> >> --- a/target-i386/kvm.c
> >> +++ b/target-i386/kvm.c
> >> @@ -467,6 +467,44 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> >>  #endif /* !KVM_CAP_MCE*/
> >>  }
> >>  
> >> +static int kvm_inject_mce_oldstyle(CPUState *env)
> >> +{
> >> +#ifdef KVM_CAP_MCE
> >> +    if (kvm_has_vcpu_events()) {
> >> +        return 0;
> >> +    }
> >> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
> >> +        unsigned int bank, bank_num = env->mcg_cap & 0xff;
> >> +        struct kvm_x86_mce mce;
> >> +
> >> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
> >> +        assert(env->mcg_cap);
> >> +
> >> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
> >> +
> >> +        /*
> >> +         * There must be at least one bank in use if CPU_INTERRUPT_MCE was set.
> >> +         * Find it and use its values for the event injection.
> >> +         */
> >> +        for (bank = 0; bank < bank_num; bank++) {
> >> +            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
> >> +                break;
> >> +            }
> >> +        }
> >> +        assert(bank < bank_num);
> >> +
> >> +        mce.bank = bank;
> >> +        mce.status = env->mce_banks[bank * 4 + 1];
> >> +        mce.mcg_status = env->mcg_status;
> >> +        mce.addr = env->mce_banks[bank * 4 + 2];
> >> +        mce.misc = env->mce_banks[bank * 4 + 3];
> >> +
> >> +        return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, &mce);
> >> +    }
> >> +#endif /* KVM_CAP_MCE */
> >> +    return 0;
> >> +}
> >> +
> >>  static void cpu_update_state(void *opaque, int running, int reason)
> >>  {
> >>      CPUState *env = opaque;
> >> @@ -1375,10 +1413,25 @@ static int kvm_put_vcpu_events(CPUState *env, int level)
> >>          return 0;
> >>      }
> >>  
> >> -    events.exception.injected = (env->exception_injected >= 0);
> >> -    events.exception.nr = env->exception_injected;
> >> -    events.exception.has_error_code = env->has_error_code;
> >> -    events.exception.error_code = env->error_code;
> >> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
> >> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
> >> +        assert(env->mcg_cap);
> >> +
> >> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
> >> +        if (env->exception_injected == EXCP08_DBLE) {
> >> +            /* this means triple fault */
> >> +            qemu_system_reset_request();
> >> +            env->exit_request = 1;
> >> +        }
> >> +        events.exception.injected = 1;
> >> +        events.exception.nr = EXCP12_MCHK;
> >> +        events.exception.has_error_code = 0;
> >> +    } else {
> >> +        events.exception.injected = (env->exception_injected >= 0);
> >> +        events.exception.nr = env->exception_injected;
> >> +        events.exception.has_error_code = env->has_error_code;
> >> +        events.exception.error_code = env->error_code;
> >> +    }
> > 
> > IMO it is important to maintain a scope for kvm_put_vcpu_events /
> > kvm_get_vcpu_events: they synchronize state to/from the kernel. Not more
> > than that. Whatever you're trying to do here should be higher in the
> > vcpu loop code.
> 
> We pick up CPU_INTERRUPT_MCE and translate it into the right exception
> that put_vcpu_events is about to sync to the kernel. What should be done
> earlier of those steps? Calculating env->exception_injected?

Everything but writeback. Update env->exception_injected/nr in
process_irqchip_events, or in a separate kvm_arch_update_exceptions.

> >>          return ret;
> >> @@ -1678,10 +1736,17 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
> >>  int kvm_arch_process_irqchip_events(CPUState *env)
> >>  {
> >>      if (kvm_irqchip_in_kernel()) {
> >> +        if (env->interrupt_request & CPU_INTERRUPT_MCE) {
> >> +            kvm_cpu_synchronize_state(env);
> >> +            if (env->mp_state == KVM_MP_STATE_HALTED) {
> >> +                env->mp_state = KVM_MP_STATE_RUNNABLE;
> >> +            }
> >> +        }
> > 
> > Should not manipulate mp_state of a running vcpu (should only do that
> > for migration when vcpu is stopped), since its managed by the kernel,
> > for irqchip case.
> 
> Not for asynchronously injected MCEs. The target CPU would simply
> oversleep them. MCEs are not in the scope of the in-kernel irqchip.

Pending MCE exception could break out of in-kernel halt emulation.
Jan Kiszka - Feb. 17, 2011, 6:04 p.m.
On 2011-02-17 18:55, Marcelo Tosatti wrote:
>>>> @@ -1375,10 +1413,25 @@ static int kvm_put_vcpu_events(CPUState *env, int level)
>>>>          return 0;
>>>>      }
>>>>  
>>>> -    events.exception.injected = (env->exception_injected >= 0);
>>>> -    events.exception.nr = env->exception_injected;
>>>> -    events.exception.has_error_code = env->has_error_code;
>>>> -    events.exception.error_code = env->error_code;
>>>> +    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
>>>> +        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
>>>> +        assert(env->mcg_cap);
>>>> +
>>>> +        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
>>>> +        if (env->exception_injected == EXCP08_DBLE) {
>>>> +            /* this means triple fault */
>>>> +            qemu_system_reset_request();
>>>> +            env->exit_request = 1;
>>>> +        }
>>>> +        events.exception.injected = 1;
>>>> +        events.exception.nr = EXCP12_MCHK;
>>>> +        events.exception.has_error_code = 0;
>>>> +    } else {
>>>> +        events.exception.injected = (env->exception_injected >= 0);
>>>> +        events.exception.nr = env->exception_injected;
>>>> +        events.exception.has_error_code = env->has_error_code;
>>>> +        events.exception.error_code = env->error_code;
>>>> +    }
>>>
>>> IMO it is important to maintain a scope for kvm_put_vcpu_events /
>>> kvm_get_vcpu_events: they synchronize state to/from the kernel. Not more
>>> than that. Whatever you're trying to do here should be higher in the
>>> vcpu loop code.
>>
>> We pick up CPU_INTERRUPT_MCE and translate it into the right exception
>> that put_vcpu_events is about to sync to the kernel. What should be done
>> earlier of those steps? Calculating env->exception_injected?
> 
> Everything but writeback. Update env->exception_injected/nr in
> process_irqchip_events, or in a separate kvm_arch_update_exceptions.
> 

OK, will rework this.

>>>>          return ret;
>>>> @@ -1678,10 +1736,17 @@ void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
>>>>  int kvm_arch_process_irqchip_events(CPUState *env)
>>>>  {
>>>>      if (kvm_irqchip_in_kernel()) {
>>>> +        if (env->interrupt_request & CPU_INTERRUPT_MCE) {
>>>> +            kvm_cpu_synchronize_state(env);
>>>> +            if (env->mp_state == KVM_MP_STATE_HALTED) {
>>>> +                env->mp_state = KVM_MP_STATE_RUNNABLE;
>>>> +            }
>>>> +        }
>>>
>>> Should not manipulate mp_state of a running vcpu (should only do that
>>> for migration when vcpu is stopped), since its managed by the kernel,
>>> for irqchip case.
>>
>> Not for asynchronously injected MCEs. The target CPU would simply
>> oversleep them. MCEs are not in the scope of the in-kernel irqchip.
> 
> Pending MCE exception could break out of in-kernel halt emulation.

Can't follow. What do you mean? That the kernel already takes care? I
didn't find a trace, so I added that code.

Jan
Marcelo Tosatti - Feb. 17, 2011, 6:17 p.m.
On Thu, Feb 17, 2011 at 07:04:51PM +0100, Jan Kiszka wrote:
> >>> Should not manipulate mp_state of a running vcpu (should only do that
> >>> for migration when vcpu is stopped), since its managed by the kernel,
> >>> for irqchip case.
> >>
> >> Not for asynchronously injected MCEs. The target CPU would simply
> >> oversleep them. MCEs are not in the scope of the in-kernel irqchip.
> > 
> > Pending MCE exception could break out of in-kernel halt emulation.
> 
> Can't follow. What do you mean? That the kernel already takes care? I
> didn't find a trace, so I added that code.

Never mind. This is rare, and a "halted -> running" transition in userspace
is harmless.

Patch

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index f909661..46f45db 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -467,6 +467,44 @@  void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
 #endif /* !KVM_CAP_MCE*/
 }
 
+static int kvm_inject_mce_oldstyle(CPUState *env)
+{
+#ifdef KVM_CAP_MCE
+    if (kvm_has_vcpu_events()) {
+        return 0;
+    }
+    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
+        unsigned int bank, bank_num = env->mcg_cap & 0xff;
+        struct kvm_x86_mce mce;
+
+        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
+        assert(env->mcg_cap);
+
+        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
+
+        /*
+         * There must be at least one bank in use if CPU_INTERRUPT_MCE was set.
+         * Find it and use its values for the event injection.
+         */
+        for (bank = 0; bank < bank_num; bank++) {
+            if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
+                break;
+            }
+        }
+        assert(bank < bank_num);
+
+        mce.bank = bank;
+        mce.status = env->mce_banks[bank * 4 + 1];
+        mce.mcg_status = env->mcg_status;
+        mce.addr = env->mce_banks[bank * 4 + 2];
+        mce.misc = env->mce_banks[bank * 4 + 3];
+
+        return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, &mce);
+    }
+#endif /* KVM_CAP_MCE */
+    return 0;
+}
+
 static void cpu_update_state(void *opaque, int running, int reason)
 {
     CPUState *env = opaque;
@@ -1375,10 +1413,25 @@  static int kvm_put_vcpu_events(CPUState *env, int level)
         return 0;
     }
 
-    events.exception.injected = (env->exception_injected >= 0);
-    events.exception.nr = env->exception_injected;
-    events.exception.has_error_code = env->has_error_code;
-    events.exception.error_code = env->error_code;
+    if (env->interrupt_request & CPU_INTERRUPT_MCE) {
+        /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
+        assert(env->mcg_cap);
+
+        env->interrupt_request &= ~CPU_INTERRUPT_MCE;
+        if (env->exception_injected == EXCP08_DBLE) {
+            /* this means triple fault */
+            qemu_system_reset_request();
+            env->exit_request = 1;
+        }
+        events.exception.injected = 1;
+        events.exception.nr = EXCP12_MCHK;
+        events.exception.has_error_code = 0;
+    } else {
+        events.exception.injected = (env->exception_injected >= 0);
+        events.exception.nr = env->exception_injected;
+        events.exception.has_error_code = env->has_error_code;
+        events.exception.error_code = env->error_code;
+    }
 
     events.interrupt.injected = (env->interrupt_injected >= 0);
     events.interrupt.nr = env->interrupt_injected;
@@ -1539,6 +1592,11 @@  int kvm_arch_put_registers(CPUState *env, int level)
     if (ret < 0) {
         return ret;
     }
+    /* must be before kvm_put_msrs */
+    ret = kvm_inject_mce_oldstyle(env);
+    if (ret < 0) {
+        return ret;
+    }
     ret = kvm_put_msrs(env, level);
     if (ret < 0) {
         return ret;
@@ -1678,10 +1736,17 @@  void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 int kvm_arch_process_irqchip_events(CPUState *env)
 {
     if (kvm_irqchip_in_kernel()) {
+        if (env->interrupt_request & CPU_INTERRUPT_MCE) {
+            kvm_cpu_synchronize_state(env);
+            if (env->mp_state == KVM_MP_STATE_HALTED) {
+                env->mp_state = KVM_MP_STATE_RUNNABLE;
+            }
+        }
         return 0;
     }
 
-    if (env->interrupt_request & (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI)) {
+    if (env->interrupt_request &
+        (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI | CPU_INTERRUPT_MCE)) {
         env->halted = 0;
     }
     if (env->interrupt_request & CPU_INTERRUPT_INIT) {