Patchwork [08/10] MCE: Relay UCR MCE to guest

login
register
mail settings
Submitter Marcelo Tosatti
Date Oct. 20, 2010, 5:43 p.m.
Message ID <10ae5833ff9de153c311917d532f3e84e5b00387.1287596626.git.mtosatti@redhat.com>
Download mbox | patch
Permalink /patch/68461/
State New
Headers show

Comments

Marcelo Tosatti - Oct. 20, 2010, 5:43 p.m.
Port qemu-kvm's

commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
Author: Huang Ying <ying.huang@intel.com>
Date:   Mon Sep 21 10:43:25 2009 +0800

    MCE: Relay UCR MCE to guest

    UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
    where some hardware error such as some memory error can be reported
    without PCC (processor context corrupted). To recover from such MCE,
    the corresponding memory will be unmapped, and all processes accessing
    the memory will be killed via SIGBUS.

    For KVM, if QEMU/KVM is killed, all guest processes will be killed
    too. So we relay SIGBUS from host OS to guest system via a UCR MCE
    injection. Then guest OS can isolate corresponding memory and kill
    necessary guest processes only. SIGBUS sent to main thread (not VCPU
    threads) will be broadcast to all VCPU threads as UCR MCE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 cpus.c                |   82 ++++++++++++++++++++--
 kvm-stub.c            |    5 ++
 kvm.h                 |    3 +
 target-i386/cpu.h     |   20 +++++-
 target-i386/helper.c  |    2 +-
 target-i386/kvm.c     |  178 ++++++++++++++++++++++++++++++++++++++++++++++++-
 target-i386/kvm_x86.h |    3 +-
 7 files changed, 279 insertions(+), 14 deletions(-)
Anthony Liguori - Oct. 20, 2010, 7:51 p.m.
On 10/20/2010 12:43 PM, Marcelo Tosatti wrote:
> Port qemu-kvm's
>
> commit 4b62fff1101a7ad77553147717a8bd3bf79df7ef
> Author: Huang Ying<ying.huang@intel.com>
> Date:   Mon Sep 21 10:43:25 2009 +0800
>
>      MCE: Relay UCR MCE to guest
>
>      UCR (uncorrected recovery) MCE is supported in recent Intel CPUs,
>      where some hardware error such as some memory error can be reported
>      without PCC (processor context corrupted). To recover from such MCE,
>      the corresponding memory will be unmapped, and all processes accessing
>      the memory will be killed via SIGBUS.
>
>      For KVM, if QEMU/KVM is killed, all guest processes will be killed
>      too. So we relay SIGBUS from host OS to guest system via a UCR MCE
>      injection. Then guest OS can isolate corresponding memory and kill
>      necessary guest processes only. SIGBUS sent to main thread (not VCPU
>      threads) will be broadcast to all VCPU threads as UCR MCE.
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
> Signed-off-by: Avi Kivity<avi@redhat.com>
> ---
>   cpus.c                |   82 ++++++++++++++++++++--
>   kvm-stub.c            |    5 ++
>   kvm.h                 |    3 +
>   target-i386/cpu.h     |   20 +++++-
>   target-i386/helper.c  |    2 +-
>   target-i386/kvm.c     |  178 ++++++++++++++++++++++++++++++++++++++++++++++++-
>   target-i386/kvm_x86.h |    3 +-
>   7 files changed, 279 insertions(+), 14 deletions(-)
>
> diff --git a/cpus.c b/cpus.c
> index 3875657..ad58c55 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -34,6 +34,10 @@
>
>   #include "cpus.h"
>   #include "compatfd.h"
> +#ifdef CONFIG_LINUX
> +#include<sys/prctl.h>
> +#include<sys/signalfd.h>
> +#endif
>    

signalfd() was introduced in 2.6.22 but this is unconditionally 
included.  This is going to break the build on any old Linux systems.

>   #ifdef SIGRTMIN
>   #define SIG_IPI (SIGRTMIN+4)
> @@ -41,6 +45,10 @@
>   #define SIG_IPI SIGUSR1
>   #endif
>
> +#ifndef PR_MCE_KILL
> +#define PR_MCE_KILL 33
> +#endif
> +
>   static CPUState *next_cpu;
>
>   /***********************************************************/
> @@ -498,28 +506,77 @@ static void qemu_tcg_wait_io_event(void)
>       }
>   }
>
> +static void sigbus_reraise(void)
> +{
> +    sigset_t set;
> +    struct sigaction action;
> +
> +    memset(&action, 0, sizeof(action));
> +    action.sa_handler = SIG_DFL;
> +    if (!sigaction(SIGBUS,&action, NULL)) {
> +        raise(SIGBUS);
> +        sigemptyset(&set);
> +        sigaddset(&set, SIGBUS);
> +        sigprocmask(SIG_UNBLOCK,&set, NULL);
> +    }
> +    perror("Failed to re-raise SIGBUS!\n");
> +    abort();
> +}
> +
> +static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
> +                           void *ctx)
> +{
> +#if defined(TARGET_I386)
> +    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
> +#endif
> +        sigbus_reraise();
>    

This violates CODING style.  Why not just get rid of the sigbus handler 
when not on TARGET_I386?

> +}
> +
>   static void qemu_kvm_eat_signal(CPUState *env, int timeout)
>   {
>       struct timespec ts;
>       int r, e;
>       siginfo_t siginfo;
>       sigset_t waitset;
> +    sigset_t chkset;
>
>       ts.tv_sec = timeout / 1000;
>       ts.tv_nsec = (timeout % 1000) * 1000000;
>
>       sigemptyset(&waitset);
>       sigaddset(&waitset, SIG_IPI);
> +    sigaddset(&waitset, SIGBUS);
>
> -    qemu_mutex_unlock(&qemu_global_mutex);
> -    r = sigtimedwait(&waitset,&siginfo,&ts);
> -    e = errno;
> -    qemu_mutex_lock(&qemu_global_mutex);
> +    do {
> +        qemu_mutex_unlock(&qemu_global_mutex);
>
> -    if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
> -        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> -        exit(1);
> -    }
> +        r = sigtimedwait(&waitset,&siginfo,&ts);
> +        e = errno;
> +
> +        qemu_mutex_lock(&qemu_global_mutex);
> +
> +        if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
> +            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> +            exit(1);
> +        }
> +
> +        switch (r) {
> +        case SIGBUS:
> +#ifdef TARGET_I386
> +            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
> +#endif
> +                sigbus_reraise();
> +            break;
> +        default:
> +            break;
> +        }
> +
> +        r = sigpending(&chkset);
> +        if (r == -1) {
> +            fprintf(stderr, "sigpending: %s\n", strerror(e));
> +            exit(1);
> +        }
> +    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
>   }
>    

I don't understand why this loop is needed but we specifically wait for 
a signal to get delivered that's either SIG_IPI or SIGBUS.  We then 
check whether a SIG_IPI or SIGBUS is pending and loop waiting for 
signals again.

Shouldn't we be looping on just sigismember(SIGBUS)?

BTW, we're no longer respecting timeout because we're not adjusting ts 
after each iteration.

>   static void qemu_kvm_wait_io_event(CPUState *env)
> @@ -640,6 +697,7 @@ static void kvm_init_ipi(CPUState *env)
>
>       pthread_sigmask(SIG_BLOCK, NULL,&set);
>       sigdelset(&set, SIG_IPI);
> +    sigdelset(&set, SIGBUS);
>       r = kvm_set_signal_mask(env,&set);
>       if (r) {
>           fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
> @@ -650,6 +708,7 @@ static void kvm_init_ipi(CPUState *env)
>   static sigset_t block_io_signals(void)
>   {
>       sigset_t set;
> +    struct sigaction action;
>
>       /* SIGUSR2 used by posix-aio-compat.c */
>       sigemptyset(&set);
> @@ -660,8 +719,15 @@ static sigset_t block_io_signals(void)
>       sigaddset(&set, SIGIO);
>       sigaddset(&set, SIGALRM);
>       sigaddset(&set, SIG_IPI);
> +    sigaddset(&set, SIGBUS);
>       pthread_sigmask(SIG_BLOCK,&set, NULL);
>
> +    memset(&action, 0, sizeof(action));
> +    action.sa_flags = SA_SIGINFO;
> +    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
> +    sigaction(SIGBUS,&action, NULL);
> +    prctl(PR_MCE_KILL, 1, 1, 0, 0);
>    

We don't seem to do any checking of the return value here.  EINVAL seems 
like an expected error to me.

>       return set;
>   }
>
> diff --git a/kvm-stub.c b/kvm-stub.c
> index d45f9fa..5384a4b 100644
> --- a/kvm-stub.c
> +++ b/kvm-stub.c
> @@ -141,3 +141,8 @@ int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign)
>   {
>       return -ENOSYS;
>   }
> +
> +int kvm_on_sigbus(int code, void *addr)
> +{
> +    return 1;
> +}
> diff --git a/kvm.h b/kvm.h
> index b2fb3af..60a9b42 100644
> --- a/kvm.h
> +++ b/kvm.h
> @@ -110,6 +110,9 @@ int kvm_arch_init_vcpu(CPUState *env);
>
>   void kvm_arch_reset_vcpu(CPUState *env);
>
> +int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr);
> +int kvm_on_sigbus(int code, void *addr);
> +
>   struct kvm_guest_debug;
>   struct kvm_debug_exit_arch;
>
> diff --git a/target-i386/cpu.h b/target-i386/cpu.h
> index 77eeab1..85ed30f 100644
> --- a/target-i386/cpu.h
> +++ b/target-i386/cpu.h
> @@ -250,16 +250,32 @@
>   #define PG_ERROR_RSVD_MASK 0x08
>   #define PG_ERROR_I_D_MASK  0x10
>
> -#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
> +#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
> +#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
>
> -#define MCE_CAP_DEF	MCG_CTL_P
> +#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
>   #define MCE_BANKS_DEF	10
>
> +#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
> +#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
>   #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
>
>   #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
>   #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
>   #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
> +#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
> +#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
> +#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
> +#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
> +#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
> +#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
> +
> +/* MISC register defines */
> +#define MCM_ADDR_SEGOFF	0	/* segment offset */
> +#define MCM_ADDR_LINEAR	1	/* linear address */
> +#define MCM_ADDR_PHYS	2	/* physical address */
> +#define MCM_ADDR_MEM	3	/* memory address */
> +#define MCM_ADDR_GENERIC 7	/* generic */
>
>   #define MSR_IA32_TSC                    0x10
>   #define MSR_IA32_APICBASE               0x1b
> diff --git a/target-i386/helper.c b/target-i386/helper.c
> index 4b430dd..4fff4a8 100644
> --- a/target-i386/helper.c
> +++ b/target-i386/helper.c
> @@ -1032,7 +1032,7 @@ void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
>           return;
>
>       if (kvm_enabled()) {
> -        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
> +        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
>           return;
>       }
>
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 343fb02..8e26bc4 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -46,6 +46,13 @@
>   #define MSR_KVM_WALL_CLOCK  0x11
>   #define MSR_KVM_SYSTEM_TIME 0x12
>
> +#ifndef BUS_MCEERR_AR
> +#define BUS_MCEERR_AR 4
> +#endif
> +#ifndef BUS_MCEERR_AO
> +#define BUS_MCEERR_AO 5
> +#endif
> +
>   #ifdef KVM_CAP_EXT_CPUID
>
>   static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
> @@ -192,10 +199,39 @@ static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
>       return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
>   }
>
> +static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
> +{
> +    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
>    

sizeof should take ()s.

> +    int r;
> +
> +    kmsrs->nmsrs = n;
> +    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
> +    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
> +    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
> +    free(kmsrs);
> +    return r;
> +}
> +
> +/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
> +static int kvm_mce_in_exception(CPUState *env)
> +{
> +    struct kvm_msr_entry msr_mcg_status = {
> +        .index = MSR_MCG_STATUS,
> +    };
> +    int r;
> +
> +    r = kvm_get_msr(env,&msr_mcg_status, 1);
> +    if (r == -1 || r == 0) {
> +        return -1;
> +    }
> +    return !!(msr_mcg_status.data&  MCG_STATUS_MCIP);
> +}
> +
>   struct kvm_x86_mce_data
>   {
>       CPUState *env;
>       struct kvm_x86_mce *mce;
> +    int abort_on_error;
>   };
>
>   static void kvm_do_inject_x86_mce(void *_data)
> @@ -203,14 +239,26 @@ static void kvm_do_inject_x86_mce(void *_data)
>       struct kvm_x86_mce_data *data = _data;
>       int r;
>
> +    /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
> +    r = kvm_mce_in_exception(data->env);
> +    if (r == -1)
> +        fprintf(stderr, "Failed to get MCE status\n");
> +    else if (r&&  !(data->mce->status&  MCI_STATUS_AR))
> +        return;
> +
>    

CODING_STYLE.

>       r = kvm_set_mce(data->env, data->mce);
> -    if (r<  0)
> +    if (r<  0) {
>           perror("kvm_set_mce FAILED");
> +        if (data->abort_on_error) {
> +            abort();
> +        }
> +    }
>   }
>   #endif
>
>   void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> -                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
> +                        int abort_on_error)
>   {
>   #ifdef KVM_CAP_MCE
>       struct kvm_x86_mce mce = {
> @@ -225,7 +273,15 @@ void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
>               .mce =&mce,
>       };
>
> +    if (!cenv->mcg_cap) {
> +        fprintf(stderr, "MCE support is not enabled!\n");
> +        return;
>    

But we unconditionally enable this functionality so that means we're 
going to get spammed on stderr whenever an MCE occurs.

We really should not register a sigbus handler unless KVM supports 
injection and let the process get killed.

> +    }
> +
>       run_on_cpu(cenv, kvm_do_inject_x86_mce,&data);
> +#else
> +    if (abort_on_error)
> +        abort();
>    

CODING_STYLE

>   #endif
>   }
>
> @@ -1528,3 +1584,121 @@ bool kvm_arch_stop_on_emulation_error(CPUState *env)
>                 ((env->segs[R_CS].selector&  3) != 3);
>   }
>
> +static void hardware_memory_error(void)
> +{
> +    fprintf(stderr, "Hardware memory error!\n");
> +    exit(1);
> +}
>    

This shouldn't be here.

> +int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
> +{
> +#if defined(KVM_CAP_MCE)
> +    struct kvm_x86_mce mce = {
> +            .bank = 9,
> +    };
> +    void *vaddr;
> +    ram_addr_t ram_addr;
> +    target_phys_addr_t paddr;
> +    int r;
> +
> +    if ((env->mcg_cap&  MCG_SER_P)&&  addr
> +&&  (code == BUS_MCEERR_AR
> +            || code == BUS_MCEERR_AO)) {
> +        if (code == BUS_MCEERR_AR) {
> +            /* Fake an Intel architectural Data Load SRAR UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | MCI_STATUS_AR | 0x134;
> +            mce.misc = (MCM_ADDR_PHYS<<  6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
> +        } else {
> +            /*
> +             * If there is an MCE excpetion being processed, ignore
> +             * this SRAO MCE
> +             */
> +            r = kvm_mce_in_exception(env);
> +            if (r == -1) {
> +                fprintf(stderr, "Failed to get MCE status\n");
> +            } else if (r) {
> +                return 0;
> +            }
> +            /* Fake an Intel architectural Memory scrubbing UCR */
> +            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +                | 0xc0;
> +            mce.misc = (MCM_ADDR_PHYS<<  6) | 0xc;
> +            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
> +        }
> +        vaddr = (void *)addr;
> +        if (qemu_ram_addr_from_host(vaddr,&ram_addr) ||
> +            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr,&paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instead of guest system!\n");
>    

This is not a very useful way to handle this condition.  why not at 
least reraise a SIGBUS so that we have a proper core?

> +            /* Hope we are lucky for AO MCE */
> +            if (code == BUS_MCEERR_AO) {
> +                return 0;
> +            } else {
> +                hardware_memory_error();
> +            }
> +        }
> +        mce.addr = paddr;
> +        r = kvm_set_mce(env,&mce);
> +        if (r<  0) {
> +            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
> +            abort();
>    

There are calls to hardware_memory_error() and fprintf() in the same 
function.  It all should be error_report().

> +        }
> +    } else
> +#endif
> +    {
> +        if (code == BUS_MCEERR_AO) {
> +            return 0;
> +        } else if (code == BUS_MCEERR_AR) {
> +            hardware_memory_error();
> +        } else {
> +            return 1;
> +        }
> +    }
> +    return 0;
> +}
> +
> +int kvm_on_sigbus(int code, void *addr)
> +{
> +#if defined(KVM_CAP_MCE)
> +    if ((first_cpu->mcg_cap&  MCG_SER_P)&&  addr&&  code == BUS_MCEERR_AO) {
> +        uint64_t status;
> +        void *vaddr;
> +        target_phys_addr_t ram_addr;
> +        unsigned long paddr;
>    

ram_addr should be ram_addr_t and paddr should be target_phys_addr_t.

> +        CPUState *cenv;
> +
> +        /* Hope we are lucky for AO MCE */
> +        vaddr = addr;
> +        if (qemu_ram_addr_from_host(vaddr,&ram_addr) ||
> +            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr,&paddr)) {
> +            fprintf(stderr, "Hardware memory error for memory used by "
> +                    "QEMU itself instead of guest system!: %p\n", addr);
>    

Again, this is not a useful way to handle this.

> +            return 0;
> +        }
> +        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
> +            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
> +            | 0xc0;
> +        kvm_inject_x86_mce(first_cpu, 9, status,
> +                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
> +                           (MCM_ADDR_PHYS<<  6) | 0xc, 1);
> +        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
> +            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
> +                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
> +        }
> +    } else
> +#endif
> +    {
> +        if (code == BUS_MCEERR_AO) {
> +            return 0;
> +        } else if (code == BUS_MCEERR_AR) {
> +            hardware_memory_error();
> +        } else {
> +            return 1;
> +        }
> +    }
> +    return 0;
> +}
> diff --git a/target-i386/kvm_x86.h b/target-i386/kvm_x86.h
> index c1ebd24..04932cf 100644
> --- a/target-i386/kvm_x86.h
> +++ b/target-i386/kvm_x86.h
> @@ -16,6 +16,7 @@
>   #define __KVM_X86_H__
>
>   void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
> -                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
> +                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
> +                        int abort_on_error);
>
>   #endif
>    

Regards,

Anthony Liguori
Anthony Liguori - Oct. 20, 2010, 8:59 p.m.
On 10/20/2010 02:51 PM, Anthony Liguori wrote:
>
>> +}
>> +
>>   static void qemu_kvm_eat_signal(CPUState *env, int timeout)
>>   {
>>       struct timespec ts;
>>       int r, e;
>>       siginfo_t siginfo;
>>       sigset_t waitset;
>> +    sigset_t chkset;
>>
>>       ts.tv_sec = timeout / 1000;
>>       ts.tv_nsec = (timeout % 1000) * 1000000;
>>
>>       sigemptyset(&waitset);
>>       sigaddset(&waitset, SIG_IPI);
>> +    sigaddset(&waitset, SIGBUS);
>>
>> -    qemu_mutex_unlock(&qemu_global_mutex);
>> -    r = sigtimedwait(&waitset,&siginfo,&ts);
>> -    e = errno;
>> -    qemu_mutex_lock(&qemu_global_mutex);
>> +    do {
>> +        qemu_mutex_unlock(&qemu_global_mutex);
>>
>> -    if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
>> -        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
>> -        exit(1);
>> -    }
>> +        r = sigtimedwait(&waitset,&siginfo,&ts);
>> +        e = errno;
>> +
>> +        qemu_mutex_lock(&qemu_global_mutex);
>> +
>> +        if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
>> +            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
>> +            exit(1);
>> +        }
>> +
>> +        switch (r) {
>> +        case SIGBUS:
>> +#ifdef TARGET_I386
>> +            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, 
>> siginfo.si_addr))
>> +#endif
>> +                sigbus_reraise();
>> +            break;
>> +        default:
>> +            break;
>> +        }
>> +
>> +        r = sigpending(&chkset);
>> +        if (r == -1) {
>> +            fprintf(stderr, "sigpending: %s\n", strerror(e));
>> +            exit(1);
>> +        }
>> +    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, 
>> SIGBUS));
>>   }
>
> I don't understand why this loop is needed but we specifically wait 
> for a signal to get delivered that's either SIG_IPI or SIGBUS.  We 
> then check whether a SIG_IPI or SIGBUS is pending and loop waiting for 
> signals again.
>
> Shouldn't we be looping on just sigismember(SIGBUS)?
>
> BTW, we're no longer respecting timeout because we're not adjusting ts 
> after each iteration.

I think this is important too.  The last time I went through the code 
and played around here, it wasn't possible to set timeout to a very, 
very large value because there are still things that we poll for (like 
whether shutdown has occurred).   If we loop indefinitely without 
reducing ts, we can potentially recreate an infinite timeout which means 
we won't catch any of the events we poll for.  This would be a very, 
very subtle bug to track down.

Regards,

Anthony Liguori
Marcelo Tosatti - Oct. 20, 2010, 9:28 p.m.
On Wed, Oct 20, 2010 at 02:51:56PM -0500, Anthony Liguori wrote:
> >+        e = errno;
> >+
> >+        qemu_mutex_lock(&qemu_global_mutex);
> >+
> >+        if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
> >+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> >+            exit(1);
> >+        }
> >+
> >+        switch (r) {
> >+        case SIGBUS:
> >+#ifdef TARGET_I386
> >+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
> >+#endif
> >+                sigbus_reraise();
> >+            break;
> >+        default:
> >+            break;
> >+        }
> >+
> >+        r = sigpending(&chkset);
> >+        if (r == -1) {
> >+            fprintf(stderr, "sigpending: %s\n", strerror(e));
> >+            exit(1);
> >+        }
> >+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
> >  }
> 
> I don't understand why this loop is needed but we specifically wait
> for a signal to get delivered that's either SIG_IPI or SIGBUS.  We
> then check whether a SIG_IPI or SIGBUS is pending and loop waiting
> for signals again.
>
> Shouldn't we be looping on just sigismember(SIGBUS)?

Think of SIG_IPI and SIGBUS pending. SIGBUS must be processed
immediately.

Yes, sigismember(SIGBUS) would be fine. But the current code too.

> BTW, we're no longer respecting timeout because we're not adjusting
> ts after each iteration.

Right, timeout not used at the moment.
Marcelo Tosatti - Oct. 20, 2010, 9:33 p.m.
On Wed, Oct 20, 2010 at 03:59:29PM -0500, Anthony Liguori wrote:
> On 10/20/2010 02:51 PM, Anthony Liguori wrote:
> >
> >>+}
> >>+
> >>  static void qemu_kvm_eat_signal(CPUState *env, int timeout)
> >>  {
> >>      struct timespec ts;
> >>      int r, e;
> >>      siginfo_t siginfo;
> >>      sigset_t waitset;
> >>+    sigset_t chkset;
> >>
> >>      ts.tv_sec = timeout / 1000;
> >>      ts.tv_nsec = (timeout % 1000) * 1000000;
> >>
> >>      sigemptyset(&waitset);
> >>      sigaddset(&waitset, SIG_IPI);
> >>+    sigaddset(&waitset, SIGBUS);
> >>
> >>-    qemu_mutex_unlock(&qemu_global_mutex);
> >>-    r = sigtimedwait(&waitset,&siginfo,&ts);
> >>-    e = errno;
> >>-    qemu_mutex_lock(&qemu_global_mutex);
> >>+    do {
> >>+        qemu_mutex_unlock(&qemu_global_mutex);
> >>
> >>-    if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
> >>-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> >>-        exit(1);
> >>-    }
> >>+        r = sigtimedwait(&waitset,&siginfo,&ts);
> >>+        e = errno;
> >>+
> >>+        qemu_mutex_lock(&qemu_global_mutex);
> >>+
> >>+        if (r == -1&&  !(e == EAGAIN || e == EINTR)) {
> >>+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
> >>+            exit(1);
> >>+        }
> >>+
> >>+        switch (r) {
> >>+        case SIGBUS:
> >>+#ifdef TARGET_I386
> >>+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code,
> >>siginfo.si_addr))
> >>+#endif
> >>+                sigbus_reraise();
> >>+            break;
> >>+        default:
> >>+            break;
> >>+        }
> >>+
> >>+        r = sigpending(&chkset);
> >>+        if (r == -1) {
> >>+            fprintf(stderr, "sigpending: %s\n", strerror(e));
> >>+            exit(1);
> >>+        }
> >>+    } while (sigismember(&chkset, SIG_IPI) ||
> >>sigismember(&chkset, SIGBUS));
> >>  }
> >
> >I don't understand why this loop is needed but we specifically
> >wait for a signal to get delivered that's either SIG_IPI or
> >SIGBUS.  We then check whether a SIG_IPI or SIGBUS is pending and
> >loop waiting for signals again.
> >
> >Shouldn't we be looping on just sigismember(SIGBUS)?
> >
> >BTW, we're no longer respecting timeout because we're not
> >adjusting ts after each iteration.
> 
> I think this is important too.  The last time I went through the
> code and played around here, it wasn't possible to set timeout to a
> very, very large value because there are still things that we poll
> for (like whether shutdown has occurred).   If we loop indefinitely
> without reducing ts, we can potentially recreate an infinite timeout
> which means we won't catch any of the events we poll for.  This
> would be a very, very subtle bug to track down.

We should just kill timeout parameter, i don't see any use for it.
Paolo Bonzini - Oct. 20, 2010, 9:56 p.m.
On 10/20/2010 09:51 PM, Anthony Liguori wrote:
> I don't understand why this loop is needed but we specifically wait for
> a signal to get delivered that's either SIG_IPI or SIGBUS. We then check
> whether a SIG_IPI or SIGBUS is pending and loop waiting for signals again.
>
> Shouldn't we be looping on just sigismember(SIGBUS)?

You mean because SIG_IPI is a real-time signal and standard signals are 
delivered first?  OTOH, real-time signals can be queued multiple times 
so it makes sense to loop on SIG_IPI as well.

> BTW, we're no longer respecting timeout because we're not adjusting ts
> after each iteration.

The timeout of qemu_kvm_eat_signal is always zero.

Paolo
Anthony Liguori - Oct. 20, 2010, 10:03 p.m.
On 10/20/2010 04:56 PM, Paolo Bonzini wrote:
> On 10/20/2010 09:51 PM, Anthony Liguori wrote:
>> I don't understand why this loop is needed but we specifically wait for
>> a signal to get delivered that's either SIG_IPI or SIGBUS. We then check
>> whether a SIG_IPI or SIGBUS is pending and loop waiting for signals 
>> again.
>>
>> Shouldn't we be looping on just sigismember(SIGBUS)?
>
> You mean because SIG_IPI is a real-time signal and standard signals 
> are delivered first?  OTOH, real-time signals can be queued multiple 
> times so it makes sense to loop on SIG_IPI as well.
>
>> BTW, we're no longer respecting timeout because we're not adjusting ts
>> after each iteration.
>
> The timeout of qemu_kvm_eat_signal is always zero.

So then qemu_kvm_eat_signal purely polls and it will happily keep 
polling as long as there is a signal pending.

So what's the point of doing a sigtimedwait() and dropping qemu_mutex?  
Why not just check sigpending in a loop?

Regards,

Anthony Liguori

> Paolo
Paolo Bonzini - Oct. 21, 2010, 7:41 a.m.
On 10/21/2010 12:03 AM, Anthony Liguori wrote:
>>
>> The timeout of qemu_kvm_eat_signal is always zero.
>
> So then qemu_kvm_eat_signal purely polls and it will happily keep
> polling as long as there is a signal pending.
>
> So what's the point of doing a sigtimedwait() and dropping qemu_mutex?

I agree that keeping the qemu_mutex makes sense if you remove the 
timeout argument (which I even have a patch for, as part of my Win32 
iothread series).  Until there is the theoretical possibility of 
suspending the process, qemu_kvm_eat_signal should drop the mutex.

> Why not just check sigpending in a loop?

Because sigtimedwait eats the signal, unlike sigpending (and sigsuspend).

Patch

diff --git a/cpus.c b/cpus.c
index 3875657..ad58c55 100644
--- a/cpus.c
+++ b/cpus.c
@@ -34,6 +34,10 @@ 
 
 #include "cpus.h"
 #include "compatfd.h"
+#ifdef CONFIG_LINUX
+#include <sys/prctl.h>
+#include <sys/signalfd.h>
+#endif
 
 #ifdef SIGRTMIN
 #define SIG_IPI (SIGRTMIN+4)
@@ -41,6 +45,10 @@ 
 #define SIG_IPI SIGUSR1
 #endif
 
+#ifndef PR_MCE_KILL
+#define PR_MCE_KILL 33
+#endif
+
 static CPUState *next_cpu;
 
 /***********************************************************/
@@ -498,28 +506,77 @@  static void qemu_tcg_wait_io_event(void)
     }
 }
 
+static void sigbus_reraise(void)
+{
+    sigset_t set;
+    struct sigaction action;
+
+    memset(&action, 0, sizeof(action));
+    action.sa_handler = SIG_DFL;
+    if (!sigaction(SIGBUS, &action, NULL)) {
+        raise(SIGBUS);
+        sigemptyset(&set);
+        sigaddset(&set, SIGBUS);
+        sigprocmask(SIG_UNBLOCK, &set, NULL);
+    }
+    perror("Failed to re-raise SIGBUS!\n");
+    abort();
+}
+
+static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
+                           void *ctx)
+{
+#if defined(TARGET_I386)
+    if (kvm_on_sigbus(siginfo->ssi_code, (void *)(intptr_t)siginfo->ssi_addr))
+#endif
+        sigbus_reraise();
+}
+
 static void qemu_kvm_eat_signal(CPUState *env, int timeout)
 {
     struct timespec ts;
     int r, e;
     siginfo_t siginfo;
     sigset_t waitset;
+    sigset_t chkset;
 
     ts.tv_sec = timeout / 1000;
     ts.tv_nsec = (timeout % 1000) * 1000000;
 
     sigemptyset(&waitset);
     sigaddset(&waitset, SIG_IPI);
+    sigaddset(&waitset, SIGBUS);
 
-    qemu_mutex_unlock(&qemu_global_mutex);
-    r = sigtimedwait(&waitset, &siginfo, &ts);
-    e = errno;
-    qemu_mutex_lock(&qemu_global_mutex);
+    do {
+        qemu_mutex_unlock(&qemu_global_mutex);
 
-    if (r == -1 && !(e == EAGAIN || e == EINTR)) {
-        fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
-        exit(1);
-    }
+        r = sigtimedwait(&waitset, &siginfo, &ts);
+        e = errno;
+
+        qemu_mutex_lock(&qemu_global_mutex);
+
+        if (r == -1 && !(e == EAGAIN || e == EINTR)) {
+            fprintf(stderr, "sigtimedwait: %s\n", strerror(e));
+            exit(1);
+        }
+
+        switch (r) {
+        case SIGBUS:
+#ifdef TARGET_I386
+            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr))
+#endif
+                sigbus_reraise();
+            break;
+        default:
+            break;
+        }
+
+        r = sigpending(&chkset);
+        if (r == -1) {
+            fprintf(stderr, "sigpending: %s\n", strerror(e));
+            exit(1);
+        }
+    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 }
 
 static void qemu_kvm_wait_io_event(CPUState *env)
@@ -640,6 +697,7 @@  static void kvm_init_ipi(CPUState *env)
 
     pthread_sigmask(SIG_BLOCK, NULL, &set);
     sigdelset(&set, SIG_IPI);
+    sigdelset(&set, SIGBUS);
     r = kvm_set_signal_mask(env, &set);
     if (r) {
         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(r));
@@ -650,6 +708,7 @@  static void kvm_init_ipi(CPUState *env)
 static sigset_t block_io_signals(void)
 {
     sigset_t set;
+    struct sigaction action;
 
     /* SIGUSR2 used by posix-aio-compat.c */
     sigemptyset(&set);
@@ -660,8 +719,15 @@  static sigset_t block_io_signals(void)
     sigaddset(&set, SIGIO);
     sigaddset(&set, SIGALRM);
     sigaddset(&set, SIG_IPI);
+    sigaddset(&set, SIGBUS);
     pthread_sigmask(SIG_BLOCK, &set, NULL);
 
+    memset(&action, 0, sizeof(action));
+    action.sa_flags = SA_SIGINFO;
+    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
+    sigaction(SIGBUS, &action, NULL);
+    prctl(PR_MCE_KILL, 1, 1, 0, 0);
+
     return set;
 }
 
diff --git a/kvm-stub.c b/kvm-stub.c
index d45f9fa..5384a4b 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -141,3 +141,8 @@  int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign)
 {
     return -ENOSYS;
 }
+
+int kvm_on_sigbus(int code, void *addr)
+{
+    return 1;
+}
diff --git a/kvm.h b/kvm.h
index b2fb3af..60a9b42 100644
--- a/kvm.h
+++ b/kvm.h
@@ -110,6 +110,9 @@  int kvm_arch_init_vcpu(CPUState *env);
 
 void kvm_arch_reset_vcpu(CPUState *env);
 
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr);
+int kvm_on_sigbus(int code, void *addr);
+
 struct kvm_guest_debug;
 struct kvm_debug_exit_arch;
 
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 77eeab1..85ed30f 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -250,16 +250,32 @@ 
 #define PG_ERROR_RSVD_MASK 0x08
 #define PG_ERROR_I_D_MASK  0x10
 
-#define MCG_CTL_P	(1UL<<8)   /* MCG_CAP register available */
+#define MCG_CTL_P	(1ULL<<8)   /* MCG_CAP register available */
+#define MCG_SER_P	(1ULL<<24) /* MCA recovery/new status bits */
 
-#define MCE_CAP_DEF	MCG_CTL_P
+#define MCE_CAP_DEF	(MCG_CTL_P|MCG_SER_P)
 #define MCE_BANKS_DEF	10
 
+#define MCG_STATUS_RIPV	(1ULL<<0)   /* restart ip valid */
+#define MCG_STATUS_EIPV	(1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP	(1ULL<<2)   /* machine check in progress */
 
 #define MCI_STATUS_VAL	(1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER	(1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC	(1ULL<<61)  /* uncorrected error */
+#define MCI_STATUS_EN	(1ULL<<60)  /* error enabled */
+#define MCI_STATUS_MISCV (1ULL<<59) /* misc error reg. valid */
+#define MCI_STATUS_ADDRV (1ULL<<58) /* addr reg. valid */
+#define MCI_STATUS_PCC	(1ULL<<57)  /* processor context corrupt */
+#define MCI_STATUS_S	(1ULL<<56)  /* Signaled machine check */
+#define MCI_STATUS_AR	(1ULL<<55)  /* Action required */
+
+/* MISC register defines */
+#define MCM_ADDR_SEGOFF	0	/* segment offset */
+#define MCM_ADDR_LINEAR	1	/* linear address */
+#define MCM_ADDR_PHYS	2	/* physical address */
+#define MCM_ADDR_MEM	3	/* memory address */
+#define MCM_ADDR_GENERIC 7	/* generic */
 
 #define MSR_IA32_TSC                    0x10
 #define MSR_IA32_APICBASE               0x1b
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 4b430dd..4fff4a8 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1032,7 +1032,7 @@  void cpu_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
         return;
 
     if (kvm_enabled()) {
-        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc);
+        kvm_inject_x86_mce(cenv, bank, status, mcg_status, addr, misc, 0);
         return;
     }
 
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 343fb02..8e26bc4 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -46,6 +46,13 @@ 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#ifndef BUS_MCEERR_AR
+#define BUS_MCEERR_AR 4
+#endif
+#ifndef BUS_MCEERR_AO
+#define BUS_MCEERR_AO 5
+#endif
+
 #ifdef KVM_CAP_EXT_CPUID
 
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
@@ -192,10 +199,39 @@  static int kvm_set_mce(CPUState *env, struct kvm_x86_mce *m)
     return kvm_vcpu_ioctl(env, KVM_X86_SET_MCE, m);
 }
 
+static int kvm_get_msr(CPUState *env, struct kvm_msr_entry *msrs, int n)
+{
+    struct kvm_msrs *kmsrs = qemu_malloc(sizeof *kmsrs + n * sizeof *msrs);
+    int r;
+
+    kmsrs->nmsrs = n;
+    memcpy(kmsrs->entries, msrs, n * sizeof *msrs);
+    r = kvm_vcpu_ioctl(env, KVM_GET_MSRS, kmsrs);
+    memcpy(msrs, kmsrs->entries, n * sizeof *msrs);
+    free(kmsrs);
+    return r;
+}
+
+/* FIXME: kill this and kvm_get_msr, use env->mcg_status instead */
+static int kvm_mce_in_exception(CPUState *env)
+{
+    struct kvm_msr_entry msr_mcg_status = {
+        .index = MSR_MCG_STATUS,
+    };
+    int r;
+
+    r = kvm_get_msr(env, &msr_mcg_status, 1);
+    if (r == -1 || r == 0) {
+        return -1;
+    }
+    return !!(msr_mcg_status.data & MCG_STATUS_MCIP);
+}
+
 struct kvm_x86_mce_data
 {
     CPUState *env;
     struct kvm_x86_mce *mce;
+    int abort_on_error;
 };
 
 static void kvm_do_inject_x86_mce(void *_data)
@@ -203,14 +239,26 @@  static void kvm_do_inject_x86_mce(void *_data)
     struct kvm_x86_mce_data *data = _data;
     int r;
 
+    /* If there is an MCE excpetion being processed, ignore this SRAO MCE */
+    r = kvm_mce_in_exception(data->env);
+    if (r == -1)
+        fprintf(stderr, "Failed to get MCE status\n");
+    else if (r && !(data->mce->status & MCI_STATUS_AR))
+        return;
+
     r = kvm_set_mce(data->env, data->mce);
-    if (r < 0)
+    if (r < 0) {
         perror("kvm_set_mce FAILED");
+        if (data->abort_on_error) {
+            abort();
+        }
+    }
 }
 #endif
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc)
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error)
 {
 #ifdef KVM_CAP_MCE
     struct kvm_x86_mce mce = {
@@ -225,7 +273,15 @@  void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
             .mce = &mce,
     };
 
+    if (!cenv->mcg_cap) {
+        fprintf(stderr, "MCE support is not enabled!\n");
+        return;
+    }
+
     run_on_cpu(cenv, kvm_do_inject_x86_mce, &data);
+#else
+    if (abort_on_error)
+        abort();
 #endif
 }
 
@@ -1528,3 +1584,121 @@  bool kvm_arch_stop_on_emulation_error(CPUState *env)
               ((env->segs[R_CS].selector  & 3) != 3);
 }
 
+static void hardware_memory_error(void)
+{
+    fprintf(stderr, "Hardware memory error!\n");
+    exit(1);
+}
+
+int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    struct kvm_x86_mce mce = {
+            .bank = 9,
+    };
+    void *vaddr;
+    ram_addr_t ram_addr;
+    target_phys_addr_t paddr;
+    int r;
+
+    if ((env->mcg_cap & MCG_SER_P) && addr
+        && (code == BUS_MCEERR_AR
+            || code == BUS_MCEERR_AO)) {
+        if (code == BUS_MCEERR_AR) {
+            /* Fake an Intel architectural Data Load SRAR UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | MCI_STATUS_AR | 0x134;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+        } else {
+            /*
+             * If there is an MCE excpetion being processed, ignore
+             * this SRAO MCE
+             */
+            r = kvm_mce_in_exception(env);
+            if (r == -1) {
+                fprintf(stderr, "Failed to get MCE status\n");
+            } else if (r) {
+                return 0;
+            }
+            /* Fake an Intel architectural Memory scrubbing UCR */
+            mce.status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+                | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+                | 0xc0;
+            mce.misc = (MCM_ADDR_PHYS << 6) | 0xc;
+            mce.mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
+        }
+        vaddr = (void *)addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(env->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!\n");
+            /* Hope we are lucky for AO MCE */
+            if (code == BUS_MCEERR_AO) {
+                return 0;
+            } else {
+                hardware_memory_error();
+            }
+        }
+        mce.addr = paddr;
+        r = kvm_set_mce(env, &mce);
+        if (r < 0) {
+            fprintf(stderr, "kvm_set_mce: %s\n", strerror(errno));
+            abort();
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+int kvm_on_sigbus(int code, void *addr)
+{
+#if defined(KVM_CAP_MCE)
+    if ((first_cpu->mcg_cap & MCG_SER_P) && addr && code == BUS_MCEERR_AO) {
+        uint64_t status;
+        void *vaddr;
+        target_phys_addr_t ram_addr;
+        unsigned long paddr;
+        CPUState *cenv;
+
+        /* Hope we are lucky for AO MCE */
+        vaddr = addr;
+        if (qemu_ram_addr_from_host(vaddr, &ram_addr) ||
+            !kvm_physical_memory_addr_from_ram(first_cpu->kvm_state, ram_addr, &paddr)) {
+            fprintf(stderr, "Hardware memory error for memory used by "
+                    "QEMU itself instead of guest system!: %p\n", addr);
+            return 0;
+        }
+        status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN
+            | MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S
+            | 0xc0;
+        kvm_inject_x86_mce(first_cpu, 9, status,
+                           MCG_STATUS_MCIP | MCG_STATUS_RIPV, paddr,
+                           (MCM_ADDR_PHYS << 6) | 0xc, 1);
+        for (cenv = first_cpu->next_cpu; cenv != NULL; cenv = cenv->next_cpu) {
+            kvm_inject_x86_mce(cenv, 1, MCI_STATUS_VAL | MCI_STATUS_UC,
+                               MCG_STATUS_MCIP | MCG_STATUS_RIPV, 0, 0, 1);
+        }
+    } else
+#endif
+    {
+        if (code == BUS_MCEERR_AO) {
+            return 0;
+        } else if (code == BUS_MCEERR_AR) {
+            hardware_memory_error();
+        } else {
+            return 1;
+        }
+    }
+    return 0;
+}
diff --git a/target-i386/kvm_x86.h b/target-i386/kvm_x86.h
index c1ebd24..04932cf 100644
--- a/target-i386/kvm_x86.h
+++ b/target-i386/kvm_x86.h
@@ -16,6 +16,7 @@ 
 #define __KVM_X86_H__
 
 void kvm_inject_x86_mce(CPUState *cenv, int bank, uint64_t status,
-                        uint64_t mcg_status, uint64_t addr, uint64_t misc);
+                        uint64_t mcg_status, uint64_t addr, uint64_t misc,
+                        int abort_on_error);
 
 #endif