[v3,4/5] target/ppc: Handle NMI guest exit

Message ID 150287475974.9760.13295593936611613542.stgit@aravinda
State New

Commit Message

Aravinda Prasad Aug. 16, 2017, 9:12 a.m. UTC
Memory errors, such as bit flips, that cannot be corrected
by hardware are passed on to the kernel for handling.
If the memory address in error belongs to the guest, the
guest kernel is responsible for taking suitable action.
Patch [1] enhances KVM to exit the guest with the exit
reason set to KVM_EXIT_NMI in such cases.

This patch handles the KVM_EXIT_NMI exit. If the guest OS
has registered a machine check handling routine by
calling "ibm,nmi-register", the handler builds the
error log and invokes the registered routine; otherwise
it invokes the handler at 0x200.

[1] https://www.spinics.net/lists/kvm-ppc/msg12637.html
	(e20bbd3d and related commits)

Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c       |    4 ++
 target/ppc/kvm.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++
 target/ppc/kvm_ppc.h |   81 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 171 insertions(+)

Comments

David Gibson Aug. 17, 2017, 1:57 a.m. UTC | #1
On Wed, Aug 16, 2017 at 02:42:39PM +0530, Aravinda Prasad wrote:
> Memory error such as bit flips that cannot be corrected
> by hardware are passed on to the kernel for handling.
> If the memory address in error belongs to guest then
> guest kernel is responsible for taking suitable action.
> Patch [1] enhances KVM to exit guest with exit reason
> set to KVM_EXIT_NMI in such cases.
> 
> This patch handles KVM_EXIT_NMI exit. If the guest OS
> has registered the machine check handling routine by
> calling "ibm,nmi-register", then the handler builds
> the error log and invokes the registered handler else
> invokes the handler at 0x200.
> 
> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html
> 	(e20bbd3d and related commits)
> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c       |    4 ++
>  target/ppc/kvm.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  target/ppc/kvm_ppc.h |   81 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 171 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 0bb2c4a..6cc3f69 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2346,6 +2346,10 @@ static void ppc_spapr_init(MachineState *machine)
>          error_report("Could not get size of LPAR rtas '%s'", filename);
>          exit(1);
>      }
> +
> +    /* Resize blob to accommodate error log. */
> +    spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog);
> +
>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>          error_report("Could not load LPAR rtas '%s'", filename);
> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> index 8571379..73f64ed 100644
> --- a/target/ppc/kvm.c
> +++ b/target/ppc/kvm.c
> @@ -1782,6 +1782,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>          ret = 0;
>          break;
>  
> +    case KVM_EXIT_NMI:
> +        DPRINTF("handle NMI exception\n");
> +        ret = kvm_handle_nmi(cpu);
> +        break;
> +
>      default:
>          fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
>          ret = -1;
> @@ -2704,6 +2709,87 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
>      return data & 0xffff;
>  }
>  
> +int kvm_handle_nmi(PowerPCCPU *cpu)

So you only handle NMIs with KVM.  Wouldn't it make sense to also
handle them for TCG (where they can be triggered with the "nmi"
command on the monitor).

> +{
> +    struct RtasMCELog mc_log;
> +    CPUPPCState *env = &cpu->env;
> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    target_ulong msr = 0;
> +
> +    cpu_synchronize_state(CPU(cpu));
> +
> +    /*
> +     * Properly set bits in MSR before we invoke the handler.
> +     * SRR0/1, DAR and DSISR are properly set by KVM
> +     */
> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> +        msr |= (1ULL << MSR_LE);
> +    }
> +
> +    if (env->msr & (1ULL << MSR_SF)) {
> +        msr |= (1ULL << MSR_SF);
> +    }
> +
> +    msr |= (1ULL << MSR_ME);
> +    env->msr = msr;
> +
> +    if (!spapr->guest_machine_check_addr) {
> +        /*
> +         * If OS has not registered with "ibm,nmi-register"
> +         * jump to 0x200
> +         */
> +        env->nip = 0x200;
> +        return 0;
> +    }
> +
> +    while (spapr->mc_in_progress) {
> +        /*
> +         * Check whether the same CPU got machine check error
> +         * while still handling the mc error (i.e., before
> +         * that CPU called "ibm,nmi-interlock"
> +         */
> +        if (spapr->mc_cpu == cpu->cpu_dt_id) {
> +            qemu_system_guest_panicked(NULL);
> +        }
> +        qemu_cond_wait_iothread(&spapr->mc_delivery_cond);
> +    }
> +    spapr->mc_in_progress = true;
> +    spapr->mc_cpu = cpu->cpu_dt_id;

This will be merging against 2.11 and there are changes to the use of
cpu_dt_id in the ppc-for-2.11 tree which you'll need to rebase on top
of.

> +    /* Set error log fields */
> +    mc_log.r3 = env->gpr[3];
> +    mc_log.err_log.byte0 = 0;
> +    mc_log.err_log.byte1 =
> +        (RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT);
> +    mc_log.err_log.byte1 |=
> +        (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT);
> +    mc_log.err_log.byte2 =
> +        (RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT);
> +    mc_log.err_log.byte2 |= RTAS_TARGET_MEMORY;
> +
> +    if (env->spr[SPR_DSISR] & P7_DSISR_MC_UE) {
> +        mc_log.err_log.byte3 = RTAS_TYPE_ECC_UNCORR;
> +    } else {
> +        mc_log.err_log.byte3 = 0;
> +    }
> +
> +    /* Handle all Host/Guest LE/BE combinations */
> +    if (env->msr & (1ULL << MSR_LE)) {
> +        mc_log.r3 = cpu_to_le64(mc_log.r3);
> +    } else {
> +        mc_log.r3 = cpu_to_be64(mc_log.r3);
> +    }

So, the r3 field is guest order, but the rest is fixed BE order, is
that right?


> +    cpu_physical_memory_write(spapr->rtas_addr + RTAS_ERRLOG_OFFSET,
> +                              &mc_log, sizeof(mc_log));
> +

You never set extended_log_length, so it doesn't look like the whole
structure is initialized.

> +    env->nip = spapr->guest_machine_check_addr;
> +    env->gpr[3] = spapr->rtas_addr + RTAS_ERRLOG_OFFSET;



> +    return 0;
> +}
> +
>  int kvmppc_enable_hwrng(void)
>  {
>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
> index 6bc6fb3..bc8e3ce 100644
> --- a/target/ppc/kvm_ppc.h
> +++ b/target/ppc/kvm_ppc.h
> @@ -70,6 +70,87 @@ void kvmppc_update_sdr1(target_ulong sdr1);
>  
>  bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path);
>  
> +int kvm_handle_nmi(PowerPCCPU *cpu);
> +
> +/* Offset from rtas-base where error log is placed */
> +#define RTAS_ERRLOG_OFFSET       0x200
> +
> +#define RTAS_ELOG_SEVERITY_SHIFT         0x5
> +#define RTAS_ELOG_DISPOSITION_SHIFT      0x3
> +#define RTAS_ELOG_INITIATOR_SHIFT        0x4
> +
> +/*
> + * Only required RTAS event severity, disposition, initiator
> + * target and type are copied from arch/powerpc/include/asm/rtas.h
> + */
> +
> +/* RTAS event severity */
> +#define RTAS_SEVERITY_ERROR_SYNC    0x3
> +
> +/* RTAS event disposition */
> +#define RTAS_DISP_NOT_RECOVERED     0x2
> +
> +/* RTAS event initiator */
> +#define RTAS_INITIATOR_MEMORY       0x4
> +
> +/* RTAS event target */
> +#define RTAS_TARGET_MEMORY          0x4
> +
> +/* RTAS event type */
> +#define RTAS_TYPE_ECC_UNCORR        0x09
> +
> +/*
> + * Currently KVM only passes on the uncorrected machine
> + * check memory error to guest. Other machine check errors
> + * such as SLB multi-hit and TLB multi-hit are recovered
> + * in KVM and are not passed on to guest.
> + *
> + * DSISR Bit for uncorrected machine check error. Based
> + * on arch/powerpc/include/asm/mce.h
> + */
> +#define PPC_BIT(bit)                (0x8000000000000000ULL >> bit)
> +#define P7_DSISR_MC_UE              (PPC_BIT(48))  /* P8 too */
> +
> +/* Adopted from kernel source arch/powerpc/include/asm/rtas.h */
> +struct rtas_error_log {

There are already structures for rtas error logs in
hw/ppc/spapr_events.c; those should be re-used.  You can probably
share some code to transfer log entries to the guest with correct
endianness as well.

> +    /* Byte 0 */
> +    uint8_t     byte0;          /* Architectural version */
> +
> +    /* Byte 1 */
> +    uint8_t     byte1;
> +    /* XXXXXXXX
> +     * XXX      3: Severity level of error
> +     *    XX    2: Degree of recovery
> +     *      X   1: Extended log present?
> +     *       XX 2: Reserved
> +     */
> +
> +    /* Byte 2 */
> +    uint8_t     byte2;
> +    /* XXXXXXXX
> +     * XXXX     4: Initiator of event
> +     *     XXXX 4: Target of failed operation
> +     */
> +    uint8_t     byte3;          /* General event or error*/
> +    __be32      extended_log_length;    /* length in bytes */
> +    unsigned char   buffer[1];      /* Start of extended log */
> +                                /* Variable length.      */
> +};
> +
> +/*
> + * Data format in RTAS-Blob
> + *
> + * This structure contains error information related to Machine
> + * Check exception. This is filled up and copied to rtas-blob
> + * upon machine check exception. The address of rtas-blob is
> + * passed on to OS registered machine check notification
> + * routines upon machine check exception
> + */
> +struct RtasMCELog {
> +    target_ulong r3;
> +    struct rtas_error_log err_log;
> +};
> +
>  #else
>  
>  static inline uint32_t kvmppc_get_tbfreq(void)
>
Aravinda Prasad Aug. 21, 2017, 12:30 p.m. UTC | #2
On Thursday 17 August 2017 07:27 AM, David Gibson wrote:
> On Wed, Aug 16, 2017 at 02:42:39PM +0530, Aravinda Prasad wrote:
>> Memory error such as bit flips that cannot be corrected
>> by hardware are passed on to the kernel for handling.
>> If the memory address in error belongs to guest then
>> guest kernel is responsible for taking suitable action.
>> Patch [1] enhances KVM to exit guest with exit reason
>> set to KVM_EXIT_NMI in such cases.
>>
>> This patch handles KVM_EXIT_NMI exit. If the guest OS
>> has registered the machine check handling routine by
>> calling "ibm,nmi-register", then the handler builds
>> the error log and invokes the registered handler else
>> invokes the handler at 0x200.
>>
>> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html
>> 	(e20bbd3d and related commits)
>>
>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>> ---
>>  hw/ppc/spapr.c       |    4 ++
>>  target/ppc/kvm.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  target/ppc/kvm_ppc.h |   81 +++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 171 insertions(+)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 0bb2c4a..6cc3f69 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -2346,6 +2346,10 @@ static void ppc_spapr_init(MachineState *machine)
>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>          exit(1);
>>      }
>> +
>> +    /* Resize blob to accommodate error log. */
>> +    spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog);
>> +
>>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>>          error_report("Could not load LPAR rtas '%s'", filename);
>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
>> index 8571379..73f64ed 100644
>> --- a/target/ppc/kvm.c
>> +++ b/target/ppc/kvm.c
>> @@ -1782,6 +1782,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>          ret = 0;
>>          break;
>>  
>> +    case KVM_EXIT_NMI:
>> +        DPRINTF("handle NMI exception\n");
>> +        ret = kvm_handle_nmi(cpu);
>> +        break;
>> +
>>      default:
>>          fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
>>          ret = -1;
>> @@ -2704,6 +2709,87 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
>>      return data & 0xffff;
>>  }
>>  
>> +int kvm_handle_nmi(PowerPCCPU *cpu)
> 
> So you only handle NMIs with KVM.  Wouldn't it make sense to also
> handle them for TCG (where they can be triggered with the "nmi"
> command on the monitor).

For TCG it is already handled in spapr_nmi(), which ultimately branches
to the guest's 0x200 vector. Even with KVM, when KVM_CAP_PPC_FWNMI is not
enabled, we branch to the guest's 0x200 vector. Should TCG behave as if
KVM_CAP_PPC_FWNMI is not enabled?

> 
>> +{
>> +    struct RtasMCELog mc_log;
>> +    CPUPPCState *env = &cpu->env;
>> +    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>> +    target_ulong msr = 0;
>> +
>> +    cpu_synchronize_state(CPU(cpu));
>> +
>> +    /*
>> +     * Properly set bits in MSR before we invoke the handler.
>> +     * SRR0/1, DAR and DSISR are properly set by KVM
>> +     */
>> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
>> +        msr |= (1ULL << MSR_LE);
>> +    }
>> +
>> +    if (env->msr & (1ULL << MSR_SF)) {
>> +        msr |= (1ULL << MSR_SF);
>> +    }
>> +
>> +    msr |= (1ULL << MSR_ME);
>> +    env->msr = msr;
>> +
>> +    if (!spapr->guest_machine_check_addr) {
>> +        /*
>> +         * If OS has not registered with "ibm,nmi-register"
>> +         * jump to 0x200
>> +         */
>> +        env->nip = 0x200;
>> +        return 0;
>> +    }
>> +
>> +    while (spapr->mc_in_progress) {
>> +        /*
>> +         * Check whether the same CPU got machine check error
>> +         * while still handling the mc error (i.e., before
>> +         * that CPU called "ibm,nmi-interlock"
>> +         */
>> +        if (spapr->mc_cpu == cpu->cpu_dt_id) {
>> +            qemu_system_guest_panicked(NULL);
>> +        }
>> +        qemu_cond_wait_iothread(&spapr->mc_delivery_cond);
>> +    }
>> +    spapr->mc_in_progress = true;
>> +    spapr->mc_cpu = cpu->cpu_dt_id;
> 
> This will be merging against 2.11 and there are changes to the use of
> cpu_dt_id in the ppc-for-2.11 tree which you'll need to rebase on top
> of.

ok.

> 
>> +    /* Set error log fields */
>> +    mc_log.r3 = env->gpr[3];
>> +    mc_log.err_log.byte0 = 0;
>> +    mc_log.err_log.byte1 =
>> +        (RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT);
>> +    mc_log.err_log.byte1 |=
>> +        (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT);
>> +    mc_log.err_log.byte2 =
>> +        (RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT);
>> +    mc_log.err_log.byte2 |= RTAS_TARGET_MEMORY;
>> +
>> +    if (env->spr[SPR_DSISR] & P7_DSISR_MC_UE) {
>> +        mc_log.err_log.byte3 = RTAS_TYPE_ECC_UNCORR;
>> +    } else {
>> +        mc_log.err_log.byte3 = 0;
>> +    }
>> +
>> +    /* Handle all Host/Guest LE/BE combinations */
>> +    if (env->msr & (1ULL << MSR_LE)) {
>> +        mc_log.r3 = cpu_to_le64(mc_log.r3);
>> +    } else {
>> +        mc_log.r3 = cpu_to_be64(mc_log.r3);
>> +    }
> 
> So, the r3 field is guest order, but the rest is fixed BE order, is
> that right?

Yes, the r3 field is in guest order. For the rest it does not matter, as
those fields are just single bytes.

> 
> 
>> +    cpu_physical_memory_write(spapr->rtas_addr + RTAS_ERRLOG_OFFSET,
>> +                              &mc_log, sizeof(mc_log));
>> +
> 
> You never set extended_log_length, so it doesn't look like the whole
> structure is initialized.

According to PAPR, extended_log_length is valid only for version number
0x06 (the "byte0" member represents the version number). We don't use the
extended event log, so the version number is set to 0.
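
A minimal sketch of how the whole log could be zeroed before the individual
bytes are filled in, so that extended_log_length (and any padding) is never
left uninitialized. This is only an illustration reusing the field and macro
names defined in this patch, not code from the series; it assumes the same
kvm_handle_nmi() context:

    struct RtasMCELog mc_log;

    /*
     * Zero the whole structure first so that extended_log_length and
     * any padding bytes are deterministic, then fill in the fields.
     */
    memset(&mc_log, 0, sizeof(mc_log));

    mc_log.r3 = env->gpr[3];
    mc_log.err_log.byte0 = 0;     /* version 0: no extended event log */
    mc_log.err_log.byte1 = (RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT) |
                           (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT);
    mc_log.err_log.byte2 = (RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT) |
                           RTAS_TARGET_MEMORY;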

> 
>> +    env->nip = spapr->guest_machine_check_addr;
>> +    env->gpr[3] = spapr->rtas_addr + RTAS_ERRLOG_OFFSET;
> 
> 
> 
>> +    return 0;
>> +}
>> +
>>  int kvmppc_enable_hwrng(void)
>>  {
>>      if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
>> diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
>> index 6bc6fb3..bc8e3ce 100644
>> --- a/target/ppc/kvm_ppc.h
>> +++ b/target/ppc/kvm_ppc.h
>> @@ -70,6 +70,87 @@ void kvmppc_update_sdr1(target_ulong sdr1);
>>  
>>  bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path);
>>  
>> +int kvm_handle_nmi(PowerPCCPU *cpu);
>> +
>> +/* Offset from rtas-base where error log is placed */
>> +#define RTAS_ERRLOG_OFFSET       0x200
>> +
>> +#define RTAS_ELOG_SEVERITY_SHIFT         0x5
>> +#define RTAS_ELOG_DISPOSITION_SHIFT      0x3
>> +#define RTAS_ELOG_INITIATOR_SHIFT        0x4
>> +
>> +/*
>> + * Only required RTAS event severity, disposition, initiator
>> + * target and type are copied from arch/powerpc/include/asm/rtas.h
>> + */
>> +
>> +/* RTAS event severity */
>> +#define RTAS_SEVERITY_ERROR_SYNC    0x3
>> +
>> +/* RTAS event disposition */
>> +#define RTAS_DISP_NOT_RECOVERED     0x2
>> +
>> +/* RTAS event initiator */
>> +#define RTAS_INITIATOR_MEMORY       0x4
>> +
>> +/* RTAS event target */
>> +#define RTAS_TARGET_MEMORY          0x4
>> +
>> +/* RTAS event type */
>> +#define RTAS_TYPE_ECC_UNCORR        0x09
>> +
>> +/*
>> + * Currently KVM only passes on the uncorrected machine
>> + * check memory error to guest. Other machine check errors
>> + * such as SLB multi-hit and TLB multi-hit are recovered
>> + * in KVM and are not passed on to guest.
>> + *
>> + * DSISR Bit for uncorrected machine check error. Based
>> + * on arch/powerpc/include/asm/mce.h
>> + */
>> +#define PPC_BIT(bit)                (0x8000000000000000ULL >> bit)
>> +#define P7_DSISR_MC_UE              (PPC_BIT(48))  /* P8 too */
>> +
>> +/* Adopted from kernel source arch/powerpc/include/asm/rtas.h */
>> +struct rtas_error_log {
> 
> There are already structures for rtas error logs in
> hw/ppc/spapr_events.c; those should be re-used.  You can probably
> share some code to transfer log entries to the guest with correct
> endianness as well.

Sure, I will reuse them. With those structures we need to take care of
endianness, as the log is defined with two uint32_t members rather than
individual bytes:

struct rtas_error_log {
    uint32_t summary;
    uint32_t extended_length;
} QEMU_PACKED;

Regards,
Aravinda
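
For illustration only, a sketch of how the severity/disposition/initiator/
target/type values used by this patch could be folded into that big-endian
summary word. The helper name and the per-byte shift placement are
assumptions based on the byte0-byte3 layout described in this patch, not
code from spapr_events.c:

    /*
     * Hypothetical helper: pack the per-byte values used by this patch
     * into the 32-bit summary word of the spapr_events.c style log,
     * stored big-endian in guest memory.
     */
    static uint32_t rtas_mc_summary(void)
    {
        uint32_t summary = 0;

        summary |= 0 << 24;                          /* byte0: version (no extended log) */
        summary |= ((RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT) |
                    (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT)) << 16;
        summary |= ((RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT) |
                    RTAS_TARGET_MEMORY) << 8;        /* byte2: initiator/target */
        summary |= RTAS_TYPE_ECC_UNCORR;             /* byte3: event type */

        return cpu_to_be32(summary);
    }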

> 
>> +    /* Byte 0 */
>> +    uint8_t     byte0;          /* Architectural version */
>> +
>> +    /* Byte 1 */
>> +    uint8_t     byte1;
>> +    /* XXXXXXXX
>> +     * XXX      3: Severity level of error
>> +     *    XX    2: Degree of recovery
>> +     *      X   1: Extended log present?
>> +     *       XX 2: Reserved
>> +     */
>> +
>> +    /* Byte 2 */
>> +    uint8_t     byte2;
>> +    /* XXXXXXXX
>> +     * XXXX     4: Initiator of event
>> +     *     XXXX 4: Target of failed operation
>> +     */
>> +    uint8_t     byte3;          /* General event or error*/
>> +    __be32      extended_log_length;    /* length in bytes */
>> +    unsigned char   buffer[1];      /* Start of extended log */
>> +                                /* Variable length.      */
>> +};
>> +
>> +/*
>> + * Data format in RTAS-Blob
>> + *
>> + * This structure contains error information related to Machine
>> + * Check exception. This is filled up and copied to rtas-blob
>> + * upon machine check exception. The address of rtas-blob is
>> + * passed on to OS registered machine check notification
>> + * routines upon machine check exception
>> + */
>> +struct RtasMCELog {
>> +    target_ulong r3;
>> +    struct rtas_error_log err_log;
>> +};
>> +
>>  #else
>>  
>>  static inline uint32_t kvmppc_get_tbfreq(void)
>>
>
David Gibson Aug. 23, 2017, 8:39 a.m. UTC | #3
On Mon, Aug 21, 2017 at 06:00:52PM +0530, Aravinda Prasad wrote:
> 
> 
> On Thursday 17 August 2017 07:27 AM, David Gibson wrote:
> > On Wed, Aug 16, 2017 at 02:42:39PM +0530, Aravinda Prasad wrote:
> >> Memory error such as bit flips that cannot be corrected
> >> by hardware are passed on to the kernel for handling.
> >> If the memory address in error belongs to guest then
> >> guest kernel is responsible for taking suitable action.
> >> Patch [1] enhances KVM to exit guest with exit reason
> >> set to KVM_EXIT_NMI in such cases.
> >>
> >> This patch handles KVM_EXIT_NMI exit. If the guest OS
> >> has registered the machine check handling routine by
> >> calling "ibm,nmi-register", then the handler builds
> >> the error log and invokes the registered handler else
> >> invokes the handler at 0x200.
> >>
> >> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html
> >> 	(e20bbd3d and related commits)
> >>
> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >> ---
> >>  hw/ppc/spapr.c       |    4 ++
> >>  target/ppc/kvm.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  target/ppc/kvm_ppc.h |   81 +++++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 171 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index 0bb2c4a..6cc3f69 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -2346,6 +2346,10 @@ static void ppc_spapr_init(MachineState *machine)
> >>          error_report("Could not get size of LPAR rtas '%s'", filename);
> >>          exit(1);
> >>      }
> >> +
> >> +    /* Resize blob to accommodate error log. */
> >> +    spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog);
> >> +
> >>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
> >>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
> >>          error_report("Could not load LPAR rtas '%s'", filename);
> >> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> >> index 8571379..73f64ed 100644
> >> --- a/target/ppc/kvm.c
> >> +++ b/target/ppc/kvm.c
> >> @@ -1782,6 +1782,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >>          ret = 0;
> >>          break;
> >>  
> >> +    case KVM_EXIT_NMI:
> >> +        DPRINTF("handle NMI exception\n");
> >> +        ret = kvm_handle_nmi(cpu);
> >> +        break;
> >> +
> >>      default:
> >>          fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
> >>          ret = -1;
> >> @@ -2704,6 +2709,87 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
> >>      return data & 0xffff;
> >>  }
> >>  
> >> +int kvm_handle_nmi(PowerPCCPU *cpu)
> > 
> > So you only handle NMIs with KVM.  Wouldn't it make sense to also
> > handle them for TCG (where they can be triggered with the "nmi"
> > command on the monitor).
> 
> For TCG it is already handled in spapr_nmi() which ultimately branches
> to guest's 0x200 vector. Even with KVM when KVM_CAP_PPC_FWNMI is not
> enabled we branch to guest's 0x200 vector. Should TCG behave as if
> KVM_CAP_PPC_FWNMI is not enabled?

No, we should implement the FWNMI feature for TCG, which means it
needs to respect the address given by nmi-register.
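
A rough sketch of what a TCG-side delivery path might look like, reusing the
sPAPRMachineState fields this series adds. The function name and the exact
hook point are assumptions for illustration, not part of this patch:

    /*
     * Hypothetical TCG path: mirror kvm_handle_nmi()'s decision about
     * where to deliver the machine check, honouring "ibm,nmi-register".
     */
    static void spapr_tcg_deliver_mce(PowerPCCPU *cpu, sPAPRMachineState *spapr)
    {
        CPUPPCState *env = &cpu->env;

        if (!spapr->guest_machine_check_addr) {
            /* No "ibm,nmi-register" call yet: use the 0x200 vector */
            env->nip = 0x200;
            return;
        }

        /*
         * The error log would be built and written at
         * rtas_addr + RTAS_ERRLOG_OFFSET exactly as in kvm_handle_nmi()
         * before redirecting the guest to its registered handler.
         */
        env->nip = spapr->guest_machine_check_addr;
        env->gpr[3] = spapr->rtas_addr + RTAS_ERRLOG_OFFSET;
    }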
Aravinda Prasad Sept. 8, 2017, 8:09 a.m. UTC | #4
On Wednesday 23 August 2017 02:09 PM, David Gibson wrote:
> On Mon, Aug 21, 2017 at 06:00:52PM +0530, Aravinda Prasad wrote:
>>
>>
>> On Thursday 17 August 2017 07:27 AM, David Gibson wrote:
>>> On Wed, Aug 16, 2017 at 02:42:39PM +0530, Aravinda Prasad wrote:
>>>> Memory error such as bit flips that cannot be corrected
>>>> by hardware are passed on to the kernel for handling.
>>>> If the memory address in error belongs to guest then
>>>> guest kernel is responsible for taking suitable action.
>>>> Patch [1] enhances KVM to exit guest with exit reason
>>>> set to KVM_EXIT_NMI in such cases.
>>>>
>>>> This patch handles KVM_EXIT_NMI exit. If the guest OS
>>>> has registered the machine check handling routine by
>>>> calling "ibm,nmi-register", then the handler builds
>>>> the error log and invokes the registered handler else
>>>> invokes the handler at 0x200.
>>>>
>>>> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html
>>>> 	(e20bbd3d and related commits)
>>>>
>>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>>>> ---
>>>>  hw/ppc/spapr.c       |    4 ++
>>>>  target/ppc/kvm.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  target/ppc/kvm_ppc.h |   81 +++++++++++++++++++++++++++++++++++++++++++++++
>>>>  3 files changed, 171 insertions(+)
>>>>
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index 0bb2c4a..6cc3f69 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -2346,6 +2346,10 @@ static void ppc_spapr_init(MachineState *machine)
>>>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>>>          exit(1);
>>>>      }
>>>> +
>>>> +    /* Resize blob to accommodate error log. */
>>>> +    spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog);
>>>> +
>>>>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>>>>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>>>>          error_report("Could not load LPAR rtas '%s'", filename);
>>>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
>>>> index 8571379..73f64ed 100644
>>>> --- a/target/ppc/kvm.c
>>>> +++ b/target/ppc/kvm.c
>>>> @@ -1782,6 +1782,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
>>>>          ret = 0;
>>>>          break;
>>>>  
>>>> +    case KVM_EXIT_NMI:
>>>> +        DPRINTF("handle NMI exception\n");
>>>> +        ret = kvm_handle_nmi(cpu);
>>>> +        break;
>>>> +
>>>>      default:
>>>>          fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
>>>>          ret = -1;
>>>> @@ -2704,6 +2709,87 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
>>>>      return data & 0xffff;
>>>>  }
>>>>  
>>>> +int kvm_handle_nmi(PowerPCCPU *cpu)
>>>
>>> So you only handle NMIs with KVM.  Wouldn't it make sense to also
>>> handle them for TCG (where they can be triggered with the "nmi"
>>> command on the monitor).
>>
>> For TCG it is already handled in spapr_nmi() which ultimately branches
>> to guest's 0x200 vector. Even with KVM when KVM_CAP_PPC_FWNMI is not
>> enabled we branch to guest's 0x200 vector. Should TCG behave as if
>> KVM_CAP_PPC_FWNMI is not enabled?
> 
> No, we should implement the FWNMI feature for TCG, which means it
> needs to respect the address given by nmi-register.

I have a couple of questions regarding supporting NMIs triggered through
the "nmi" command from the monitor.

1. FWNMI builds an error log upon machine check that includes the
effective address that triggered the machine check, and places the error
log in the RTAS blob. Hence, we need to build an error log for an NMI
triggered via the monitor "nmi" command. What should the effective
address and the other values of the error log be in this case? Should the
"nmi" command be enhanced to take these arguments (as of now it does not
take any arguments)?

2. On the Power architecture the "nmi" monitor command triggers an NMI on
all the CPUs, while an NMI due to a hardware error is triggered only on
the CPU that accessed the bad memory region. Triggering an NMI on all the
CPUs could be a problem, as it causes a kernel panic if the CPU was in
kernel space at that time, or kills the process running on that CPU if it
was in userspace.
David Gibson Sept. 10, 2017, 3:22 a.m. UTC | #5
On Fri, Sep 08, 2017 at 01:39:37PM +0530, Aravinda Prasad wrote:
> 
> 
> On Wednesday 23 August 2017 02:09 PM, David Gibson wrote:
> > On Mon, Aug 21, 2017 at 06:00:52PM +0530, Aravinda Prasad wrote:
> >>
> >>
> >> On Thursday 17 August 2017 07:27 AM, David Gibson wrote:
> >>> On Wed, Aug 16, 2017 at 02:42:39PM +0530, Aravinda Prasad wrote:
> >>>> Memory error such as bit flips that cannot be corrected
> >>>> by hardware are passed on to the kernel for handling.
> >>>> If the memory address in error belongs to guest then
> >>>> guest kernel is responsible for taking suitable action.
> >>>> Patch [1] enhances KVM to exit guest with exit reason
> >>>> set to KVM_EXIT_NMI in such cases.
> >>>>
> >>>> This patch handles KVM_EXIT_NMI exit. If the guest OS
> >>>> has registered the machine check handling routine by
> >>>> calling "ibm,nmi-register", then the handler builds
> >>>> the error log and invokes the registered handler else
> >>>> invokes the handler at 0x200.
> >>>>
> >>>> [1] https://www.spinics.net/lists/kvm-ppc/msg12637.html
> >>>> 	(e20bbd3d and related commits)
> >>>>
> >>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >>>> ---
> >>>>  hw/ppc/spapr.c       |    4 ++
> >>>>  target/ppc/kvm.c     |   86 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  target/ppc/kvm_ppc.h |   81 +++++++++++++++++++++++++++++++++++++++++++++++
> >>>>  3 files changed, 171 insertions(+)
> >>>>
> >>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>> index 0bb2c4a..6cc3f69 100644
> >>>> --- a/hw/ppc/spapr.c
> >>>> +++ b/hw/ppc/spapr.c
> >>>> @@ -2346,6 +2346,10 @@ static void ppc_spapr_init(MachineState *machine)
> >>>>          error_report("Could not get size of LPAR rtas '%s'", filename);
> >>>>          exit(1);
> >>>>      }
> >>>> +
> >>>> +    /* Resize blob to accommodate error log. */
> >>>> +    spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog);
> >>>> +
> >>>>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
> >>>>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
> >>>>          error_report("Could not load LPAR rtas '%s'", filename);
> >>>> diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
> >>>> index 8571379..73f64ed 100644
> >>>> --- a/target/ppc/kvm.c
> >>>> +++ b/target/ppc/kvm.c
> >>>> @@ -1782,6 +1782,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> >>>>          ret = 0;
> >>>>          break;
> >>>>  
> >>>> +    case KVM_EXIT_NMI:
> >>>> +        DPRINTF("handle NMI exception\n");
> >>>> +        ret = kvm_handle_nmi(cpu);
> >>>> +        break;
> >>>> +
> >>>>      default:
> >>>>          fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
> >>>>          ret = -1;
> >>>> @@ -2704,6 +2709,87 @@ int kvm_arch_msi_data_to_gsi(uint32_t data)
> >>>>      return data & 0xffff;
> >>>>  }
> >>>>  
> >>>> +int kvm_handle_nmi(PowerPCCPU *cpu)
> >>>
> >>> So you only handle NMIs with KVM.  Wouldn't it make sense to also
> >>> handle them for TCG (where they can be triggered with the "nmi"
> >>> command on the monitor).
> >>
> >> For TCG it is already handled in spapr_nmi() which ultimately branches
> >> to guest's 0x200 vector. Even with KVM when KVM_CAP_PPC_FWNMI is not
> >> enabled we branch to guest's 0x200 vector. Should TCG behave as if
> >> KVM_CAP_PPC_FWNMI is not enabled?
> > 
> > No, we should implement the FWNMI feature for TCG, which means it
> > needs to respect the address given by nmi-register.
> 
> I have couple of questions regarding supporting NMIs triggered through
> "nmi" command from the monitor.
> 
> 1. The FWNNMI builds an error log upon machine check that includes the
> effective address that triggered the machine check and places the error
> log in RTAS blob. Hence, we need to build error log for the NMI
> triggered via the monitor "nmi" command. What should be the effective
> address and the other values of the error log in this case? Should "nmi"
> command be enhanced to take these arguments (as of now it does not take
> any arguments)?

Ah!  That's something I hadn't realised - sounds like FWNMI is only
designed to handle synchronous machine checks, whereas the monitor
command (obviously) generates asynchronous checks.

In that case it's probably reasonable that the monitor command doesn't
go through the FWNMI path.  However, it would be nice to make it a bit
more obvious, by updating some of the comments and commit messages in
the FWNMI code to emphasise that it's only for synchronous checks.

> 2. On Power architecture "nmi" monitor command triggers NMI on all the
> CPUs, while the NMI due to hardware error is triggered only on the CPU
> that accessed the bad memory region. Triggering NMI on all the CPUs
> could be a problem as it cause kernel panic if the CPU was in kernel
> space at that time or kills the process running on that CPU if it was in
> userspace.
>

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0bb2c4a..6cc3f69 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2346,6 +2346,10 @@  static void ppc_spapr_init(MachineState *machine)
         error_report("Could not get size of LPAR rtas '%s'", filename);
         exit(1);
     }
+
+    /* Resize blob to accommodate error log. */
+    spapr->rtas_size = RTAS_ERRLOG_OFFSET + sizeof(struct RtasMCELog);
+
     spapr->rtas_blob = g_malloc(spapr->rtas_size);
     if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
         error_report("Could not load LPAR rtas '%s'", filename);
diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c
index 8571379..73f64ed 100644
--- a/target/ppc/kvm.c
+++ b/target/ppc/kvm.c
@@ -1782,6 +1782,11 @@  int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
         ret = 0;
         break;
 
+    case KVM_EXIT_NMI:
+        DPRINTF("handle NMI exception\n");
+        ret = kvm_handle_nmi(cpu);
+        break;
+
     default:
         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
         ret = -1;
@@ -2704,6 +2709,87 @@  int kvm_arch_msi_data_to_gsi(uint32_t data)
     return data & 0xffff;
 }
 
+int kvm_handle_nmi(PowerPCCPU *cpu)
+{
+    struct RtasMCELog mc_log;
+    CPUPPCState *env = &cpu->env;
+    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    target_ulong msr = 0;
+
+    cpu_synchronize_state(CPU(cpu));
+
+    /*
+     * Properly set bits in MSR before we invoke the handler.
+     * SRR0/1, DAR and DSISR are properly set by KVM
+     */
+    if (!(*pcc->interrupts_big_endian)(cpu)) {
+        msr |= (1ULL << MSR_LE);
+    }
+
+    if (env->msr & (1ULL << MSR_SF)) {
+        msr |= (1ULL << MSR_SF);
+    }
+
+    msr |= (1ULL << MSR_ME);
+    env->msr = msr;
+
+    if (!spapr->guest_machine_check_addr) {
+        /*
+         * If OS has not registered with "ibm,nmi-register"
+         * jump to 0x200
+         */
+        env->nip = 0x200;
+        return 0;
+    }
+
+    while (spapr->mc_in_progress) {
+        /*
+         * Check whether the same CPU got machine check error
+         * while still handling the mc error (i.e., before
+         * that CPU called "ibm,nmi-interlock"
+         */
+        if (spapr->mc_cpu == cpu->cpu_dt_id) {
+            qemu_system_guest_panicked(NULL);
+        }
+        qemu_cond_wait_iothread(&spapr->mc_delivery_cond);
+    }
+    spapr->mc_in_progress = true;
+    spapr->mc_cpu = cpu->cpu_dt_id;
+
+    /* Set error log fields */
+    mc_log.r3 = env->gpr[3];
+    mc_log.err_log.byte0 = 0;
+    mc_log.err_log.byte1 =
+        (RTAS_SEVERITY_ERROR_SYNC << RTAS_ELOG_SEVERITY_SHIFT);
+    mc_log.err_log.byte1 |=
+        (RTAS_DISP_NOT_RECOVERED << RTAS_ELOG_DISPOSITION_SHIFT);
+    mc_log.err_log.byte2 =
+        (RTAS_INITIATOR_MEMORY << RTAS_ELOG_INITIATOR_SHIFT);
+    mc_log.err_log.byte2 |= RTAS_TARGET_MEMORY;
+
+    if (env->spr[SPR_DSISR] & P7_DSISR_MC_UE) {
+        mc_log.err_log.byte3 = RTAS_TYPE_ECC_UNCORR;
+    } else {
+        mc_log.err_log.byte3 = 0;
+    }
+
+    /* Handle all Host/Guest LE/BE combinations */
+    if (env->msr & (1ULL << MSR_LE)) {
+        mc_log.r3 = cpu_to_le64(mc_log.r3);
+    } else {
+        mc_log.r3 = cpu_to_be64(mc_log.r3);
+    }
+
+    cpu_physical_memory_write(spapr->rtas_addr + RTAS_ERRLOG_OFFSET,
+                              &mc_log, sizeof(mc_log));
+
+    env->nip = spapr->guest_machine_check_addr;
+    env->gpr[3] = spapr->rtas_addr + RTAS_ERRLOG_OFFSET;
+
+    return 0;
+}
+
 int kvmppc_enable_hwrng(void)
 {
     if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h
index 6bc6fb3..bc8e3ce 100644
--- a/target/ppc/kvm_ppc.h
+++ b/target/ppc/kvm_ppc.h
@@ -70,6 +70,87 @@  void kvmppc_update_sdr1(target_ulong sdr1);
 
 bool kvmppc_is_mem_backend_page_size_ok(const char *obj_path);
 
+int kvm_handle_nmi(PowerPCCPU *cpu);
+
+/* Offset from rtas-base where error log is placed */
+#define RTAS_ERRLOG_OFFSET       0x200
+
+#define RTAS_ELOG_SEVERITY_SHIFT         0x5
+#define RTAS_ELOG_DISPOSITION_SHIFT      0x3
+#define RTAS_ELOG_INITIATOR_SHIFT        0x4
+
+/*
+ * Only required RTAS event severity, disposition, initiator
+ * target and type are copied from arch/powerpc/include/asm/rtas.h
+ */
+
+/* RTAS event severity */
+#define RTAS_SEVERITY_ERROR_SYNC    0x3
+
+/* RTAS event disposition */
+#define RTAS_DISP_NOT_RECOVERED     0x2
+
+/* RTAS event initiator */
+#define RTAS_INITIATOR_MEMORY       0x4
+
+/* RTAS event target */
+#define RTAS_TARGET_MEMORY          0x4
+
+/* RTAS event type */
+#define RTAS_TYPE_ECC_UNCORR        0x09
+
+/*
+ * Currently KVM only passes on the uncorrected machine
+ * check memory error to guest. Other machine check errors
+ * such as SLB multi-hit and TLB multi-hit are recovered
+ * in KVM and are not passed on to guest.
+ *
+ * DSISR Bit for uncorrected machine check error. Based
+ * on arch/powerpc/include/asm/mce.h
+ */
+#define PPC_BIT(bit)                (0x8000000000000000ULL >> bit)
+#define P7_DSISR_MC_UE              (PPC_BIT(48))  /* P8 too */
+
+/* Adopted from kernel source arch/powerpc/include/asm/rtas.h */
+struct rtas_error_log {
+    /* Byte 0 */
+    uint8_t     byte0;          /* Architectural version */
+
+    /* Byte 1 */
+    uint8_t     byte1;
+    /* XXXXXXXX
+     * XXX      3: Severity level of error
+     *    XX    2: Degree of recovery
+     *      X   1: Extended log present?
+     *       XX 2: Reserved
+     */
+
+    /* Byte 2 */
+    uint8_t     byte2;
+    /* XXXXXXXX
+     * XXXX     4: Initiator of event
+     *     XXXX 4: Target of failed operation
+     */
+    uint8_t     byte3;          /* General event or error*/
+    __be32      extended_log_length;    /* length in bytes */
+    unsigned char   buffer[1];      /* Start of extended log */
+                                /* Variable length.      */
+};
+
+/*
+ * Data format in RTAS-Blob
+ *
+ * This structure contains error information related to Machine
+ * Check exception. This is filled up and copied to rtas-blob
+ * upon machine check exception. The address of rtas-blob is
+ * passed on to OS registered machine check notification
+ * routines upon machine check exception
+ */
+struct RtasMCELog {
+    target_ulong r3;
+    struct rtas_error_log err_log;
+};
+
 #else
 
 static inline uint32_t kvmppc_get_tbfreq(void)