diff mbox series

[v8,4/6] target/ppc: Build rtas error log upon an MCE

Message ID 155591660602.20338.9804738040071843715.stgit@aravinda
State New
Headers show
Series target-ppc/spapr: Add FWNMI support in QEMU for PowerKVM guests | expand

Commit Message

Aravinda Prasad April 22, 2019, 7:03 a.m. UTC
Upon a machine check exception (MCE) in a guest address space,
KVM causes a guest exit to enable QEMU to build and pass the
error to the guest in the PAPR defined rtas error log format.

This patch builds the rtas error log, copies it to the rtas_addr
and then invokes the guest registered machine check handler. The
handler in the guest takes suitable action(s) depending on the type
and criticality of the error. For example, if an error is
unrecoverable memory corruption in an application inside the
guest, then the guest kernel sends a SIGBUS to the application.
For recoverable errors, the guest performs recovery actions and
logs the error.

Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c         |    4 +
 hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr.h |    4 +
 3 files changed, 253 insertions(+)

Comments

Fabiano Rosas April 23, 2019, 2:38 p.m. UTC | #1
Aravinda Prasad <aravinda@linux.vnet.ibm.com> writes:

> +    /*
> +     * Properly set bits in MSR before we invoke the handler.
> +     * SRR0/1, DAR and DSISR are properly set by KVM
> +     */
> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> +        msr |= (1ULL << MSR_LE);
> +    }
> +
> +    if (env->msr && (1ULL << MSR_SF)) {

Don't you mean & instead of &&?

> +        msr |= (1ULL << MSR_SF);
> +    }
Aravinda Prasad April 24, 2019, 4:51 a.m. UTC | #2
On Tuesday 23 April 2019 08:08 PM, Fabiano Rosas wrote:
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> writes:
> 
>> +    /*
>> +     * Properly set bits in MSR before we invoke the handler.
>> +     * SRR0/1, DAR and DSISR are properly set by KVM
>> +     */
>> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
>> +        msr |= (1ULL << MSR_LE);
>> +    }
>> +
>> +    if (env->msr && (1ULL << MSR_SF)) {
> 
> Don't you mean & instead of &&?

Ah.. yes.. Thanks for pointing out.

> 
>> +        msr |= (1ULL << MSR_SF);
>> +    }
> 
>
David Gibson May 10, 2019, 6:42 a.m. UTC | #3
On Mon, Apr 22, 2019 at 12:33:26PM +0530, Aravinda Prasad wrote:
> Upon a machine check exception (MCE) in a guest address space,
> KVM causes a guest exit to enable QEMU to build and pass the
> error to the guest in the PAPR defined rtas error log format.
> 
> This patch builds the rtas error log, copies it to the rtas_addr
> and then invokes the guest registered machine check handler. The
> handler in the guest takes suitable action(s) depending on the type
> and criticality of the error. For example, if an error is
> unrecoverable memory corruption in an application inside the
> guest, then the guest kernel sends a SIGBUS to the application.
> For recoverable errors, the guest performs recovery actions and
> logs the error.
> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         |    4 +
>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |    4 +
>  3 files changed, 253 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 2779efe..ffd1715 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
>          error_report("Could not get size of LPAR rtas '%s'", filename);
>          exit(1);
>      }
> +
> +    /* Resize blob to accommodate error log. */
> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> +
>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>          error_report("Could not load LPAR rtas '%s'", filename);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 9922a23..4032db0 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -212,6 +212,106 @@ struct hp_extended_log {
>      struct rtas_event_log_v6_hp hp;
>  } QEMU_PACKED;
>  
> +struct rtas_event_log_v6_mc {
> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
> +    struct rtas_event_log_v6_section_header hdr;
> +    uint32_t fru_id;
> +    uint32_t proc_id;
> +    uint8_t error_type;
> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
> +    uint8_t sub_err_type;
> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
> +    uint8_t reserved_1[6];
> +    uint64_t effective_address;
> +    uint64_t logical_address;
> +} QEMU_PACKED;
> +
> +struct mc_extended_log {
> +    struct rtas_event_log_v6 v6hdr;
> +    struct rtas_event_log_v6_mc mc;
> +} QEMU_PACKED;
> +
> +struct MC_ierror_table {
> +    unsigned long srr1_mask;
> +    unsigned long srr1_value;
> +    bool nip_valid; /* nip is a valid indicator of faulting address */
> +    uint8_t error_type;
> +    uint8_t error_subtype;
> +    unsigned int initiator;
> +    unsigned int severity;
> +};
> +
> +static const struct MC_ierror_table mc_ierror_table[] = {
> +{ 0x00000000081c0000, 0x0000000000040000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000080000, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x00000000000c0000, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000100000, true,
> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000140000, true,
> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000180000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0, 0, 0, 0, 0, 0 } };
> +
> +struct MC_derror_table {
> +    unsigned long dsisr_value;
> +    bool dar_valid; /* dar is a valid indicator of faulting address */
> +    uint8_t error_type;
> +    uint8_t error_subtype;
> +    unsigned int initiator;
> +    unsigned int severity;
> +};
> +
> +static const struct MC_derror_table mc_derror_table[] = {
> +{ 0x00008000, false,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00004000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000800, true,
> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000400, true,
> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000080, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000100, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0, false, 0, 0, 0, 0 } };
> +
> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
> +
>  typedef enum EventClass {
>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
>      EVENT_CLASS_EPOW                = 1,
> @@ -620,6 +720,147 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
>  }
>  
> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
> +{
> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
> +    return RTAS_ERROR_LOG_MAX;
> +}
> +
> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
> +                                        struct mc_extended_log *ext_elog)
> +{
> +    int i;
> +    CPUPPCState *env = &cpu->env;
> +    uint32_t summary;
> +    uint64_t dsisr = env->spr[SPR_DSISR];
> +
> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
> +    if (recovered) {
> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
> +    } else {
> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
> +    }
> +
> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
> +                continue;
> +            }
> +
> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
> +            if (mc_derror_table[i].dar_valid) {
> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
> +            }
> +
> +            summary |= mc_derror_table[i].initiator
> +                        | mc_derror_table[i].severity;
> +
> +            return summary;
> +        }
> +    } else {
> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
> +                    mc_ierror_table[i].srr1_value) {
> +                continue;
> +            }
> +
> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
> +            if (mc_ierror_table[i].nip_valid) {
> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
> +            }
> +
> +            summary |= mc_ierror_table[i].initiator
> +                        | mc_ierror_table[i].severity;
> +
> +            return summary;
> +        }
> +    }
> +
> +    summary |= RTAS_LOG_INITIATOR_CPU;
> +    return summary;
> +}
> +
> +static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    CPUState *cs = CPU(cpu);
> +    uint64_t rtas_addr;
> +    CPUPPCState *env = &cpu->env;
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    target_ulong r3, msr = 0;
> +    struct rtas_error_log log;
> +    struct mc_extended_log *ext_elog;
> +    uint32_t summary;
> +
> +    /*
> +     * Properly set bits in MSR before we invoke the handler.
> +     * SRR0/1, DAR and DSISR are properly set by KVM
> +     */
> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> +        msr |= (1ULL << MSR_LE);
> +    }
> +
> +    if (env->msr && (1ULL << MSR_SF)) {
> +        msr |= (1ULL << MSR_SF);
> +    }
> +
> +    msr |= (1ULL << MSR_ME);
> +
> +    if (spapr->guest_machine_check_addr == -1) {
> +        /*
> +         * This implies that we have hit a machine check between system
> +         * reset and "ibm,nmi-register". Fall back to the old machine
> +         * check behavior in such cases.
> +         */
> +        env->spr[SPR_SRR0] = env->nip;
> +        env->spr[SPR_SRR1] = env->msr;
> +        env->msr = msr;
> +        env->nip = 0x200;
> +        return;
> +    }
> +
> +    ext_elog = g_malloc0(sizeof(struct mc_extended_log));
> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
> +
> +    log.summary = cpu_to_be32(summary);
> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
> +
> +    /* r3 should be in BE always */
> +    r3 = cpu_to_be64(env->gpr[3]);
> +    env->msr = msr;
> +
> +    spapr_init_v6hdr(&ext_elog->v6hdr);
> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
> +    ext_elog->mc.hdr.section_length =
> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
> +    ext_elog->mc.hdr.section_version = 1;
> +
> +    /* get rtas addr from fdt */
> +    rtas_addr = spapr_get_rtas_addr();
> +    if (!rtas_addr) {
> +        /* Unable to fetch rtas_addr. Hence reset the guest */
> +        ppc_cpu_do_system_reset(cs);
> +    }
> +
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
> +                              &log, sizeof(log));
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
> +                              sizeof(log), ext_elog,
> +                              sizeof(struct mc_extended_log));
> +
> +    /* Save gpr[3] in the guest endian mode */
> +    if ((*pcc->interrupts_big_endian)(cpu)) {
> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);

I don't think this is right.  AIUI env->gpr[] are all stored in *host*
endianness (for ease of doing arithmetic).

> +    } else {
> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
> +    }
> +
> +    env->nip = spapr->guest_machine_check_addr;
> +}
> +
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>          }
>      }
>      spapr->mc_status = cpu->vcpu_id;
> +
> +    spapr_mce_dispatch_elog(cpu, recovered);
> +
> +    return;
>  }
>  
>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index f7204d0..03f34bf 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>  
> +/* Offset from rtas-base where error log is placed */
> +#define RTAS_ERRLOG_OFFSET       0x25

Is this offset PAPR defined, or chosen here?  Using an entirely
unaliged (odd) address seems a very strange choice.

> +
>  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
>  {
>      return addr & ~0xF000000000000000ULL;
> @@ -798,6 +801,7 @@ int spapr_max_server_number(SpaprMachineState *spapr);
>  void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
>                        uint64_t pte0, uint64_t pte1);
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
>  
>  /* DRC callbacks. */
>  void spapr_core_release(DeviceState *dev);
>
Aravinda Prasad May 10, 2019, 7:05 a.m. UTC | #4
On Friday 10 May 2019 12:12 PM, David Gibson wrote:
> On Mon, Apr 22, 2019 at 12:33:26PM +0530, Aravinda Prasad wrote:
>> Upon a machine check exception (MCE) in a guest address space,
>> KVM causes a guest exit to enable QEMU to build and pass the
>> error to the guest in the PAPR defined rtas error log format.
>>
>> This patch builds the rtas error log, copies it to the rtas_addr
>> and then invokes the guest registered machine check handler. The
>> handler in the guest takes suitable action(s) depending on the type
>> and criticality of the error. For example, if an error is
>> unrecoverable memory corruption in an application inside the
>> guest, then the guest kernel sends a SIGBUS to the application.
>> For recoverable errors, the guest performs recovery actions and
>> logs the error.
>>
>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>> ---
>>  hw/ppc/spapr.c         |    4 +
>>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/spapr.h |    4 +
>>  3 files changed, 253 insertions(+)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 2779efe..ffd1715 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>          exit(1);
>>      }
>> +
>> +    /* Resize blob to accommodate error log. */
>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
>> +
>>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>>          error_report("Could not load LPAR rtas '%s'", filename);
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 9922a23..4032db0 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -212,6 +212,106 @@ struct hp_extended_log {
>>      struct rtas_event_log_v6_hp hp;
>>  } QEMU_PACKED;
>>  
>> +struct rtas_event_log_v6_mc {
>> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
>> +    struct rtas_event_log_v6_section_header hdr;
>> +    uint32_t fru_id;
>> +    uint32_t proc_id;
>> +    uint8_t error_type;
>> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
>> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
>> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
>> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
>> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
>> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
>> +    uint8_t sub_err_type;
>> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
>> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
>> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
>> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
>> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
>> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
>> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
>> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
>> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
>> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
>> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
>> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
>> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
>> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
>> +    uint8_t reserved_1[6];
>> +    uint64_t effective_address;
>> +    uint64_t logical_address;
>> +} QEMU_PACKED;
>> +
>> +struct mc_extended_log {
>> +    struct rtas_event_log_v6 v6hdr;
>> +    struct rtas_event_log_v6_mc mc;
>> +} QEMU_PACKED;
>> +
>> +struct MC_ierror_table {
>> +    unsigned long srr1_mask;
>> +    unsigned long srr1_value;
>> +    bool nip_valid; /* nip is a valid indicator of faulting address */
>> +    uint8_t error_type;
>> +    uint8_t error_subtype;
>> +    unsigned int initiator;
>> +    unsigned int severity;
>> +};
>> +
>> +static const struct MC_ierror_table mc_ierror_table[] = {
>> +{ 0x00000000081c0000, 0x0000000000040000, true,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000080000, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x00000000000c0000, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000100000, true,
>> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000140000, true,
>> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000000081c0000, 0x0000000000180000, true,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0, 0, 0, 0, 0, 0 } };
>> +
>> +struct MC_derror_table {
>> +    unsigned long dsisr_value;
>> +    bool dar_valid; /* dar is a valid indicator of faulting address */
>> +    uint8_t error_type;
>> +    uint8_t error_subtype;
>> +    unsigned int initiator;
>> +    unsigned int severity;
>> +};
>> +
>> +static const struct MC_derror_table mc_derror_table[] = {
>> +{ 0x00008000, false,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00004000, true,
>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000800, true,
>> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000400, true,
>> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000080, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0x00000100, true,
>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>> +{ 0, false, 0, 0, 0, 0 } };
>> +
>> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
>> +
>>  typedef enum EventClass {
>>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
>>      EVENT_CLASS_EPOW                = 1,
>> @@ -620,6 +720,147 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
>>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
>>  }
>>  
>> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
>> +{
>> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
>> +    return RTAS_ERROR_LOG_MAX;
>> +}
>> +
>> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
>> +                                        struct mc_extended_log *ext_elog)
>> +{
>> +    int i;
>> +    CPUPPCState *env = &cpu->env;
>> +    uint32_t summary;
>> +    uint64_t dsisr = env->spr[SPR_DSISR];
>> +
>> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
>> +    if (recovered) {
>> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
>> +    } else {
>> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
>> +    }
>> +
>> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
>> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
>> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
>> +                continue;
>> +            }
>> +
>> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
>> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
>> +            if (mc_derror_table[i].dar_valid) {
>> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
>> +            }
>> +
>> +            summary |= mc_derror_table[i].initiator
>> +                        | mc_derror_table[i].severity;
>> +
>> +            return summary;
>> +        }
>> +    } else {
>> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
>> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
>> +                    mc_ierror_table[i].srr1_value) {
>> +                continue;
>> +            }
>> +
>> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
>> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
>> +            if (mc_ierror_table[i].nip_valid) {
>> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
>> +            }
>> +
>> +            summary |= mc_ierror_table[i].initiator
>> +                        | mc_ierror_table[i].severity;
>> +
>> +            return summary;
>> +        }
>> +    }
>> +
>> +    summary |= RTAS_LOG_INITIATOR_CPU;
>> +    return summary;
>> +}
>> +
>> +static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>> +{
>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> +    CPUState *cs = CPU(cpu);
>> +    uint64_t rtas_addr;
>> +    CPUPPCState *env = &cpu->env;
>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>> +    target_ulong r3, msr = 0;
>> +    struct rtas_error_log log;
>> +    struct mc_extended_log *ext_elog;
>> +    uint32_t summary;
>> +
>> +    /*
>> +     * Properly set bits in MSR before we invoke the handler.
>> +     * SRR0/1, DAR and DSISR are properly set by KVM
>> +     */
>> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
>> +        msr |= (1ULL << MSR_LE);
>> +    }
>> +
>> +    if (env->msr && (1ULL << MSR_SF)) {
>> +        msr |= (1ULL << MSR_SF);
>> +    }
>> +
>> +    msr |= (1ULL << MSR_ME);
>> +
>> +    if (spapr->guest_machine_check_addr == -1) {
>> +        /*
>> +         * This implies that we have hit a machine check between system
>> +         * reset and "ibm,nmi-register". Fall back to the old machine
>> +         * check behavior in such cases.
>> +         */
>> +        env->spr[SPR_SRR0] = env->nip;
>> +        env->spr[SPR_SRR1] = env->msr;
>> +        env->msr = msr;
>> +        env->nip = 0x200;
>> +        return;
>> +    }
>> +
>> +    ext_elog = g_malloc0(sizeof(struct mc_extended_log));
>> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
>> +
>> +    log.summary = cpu_to_be32(summary);
>> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
>> +
>> +    /* r3 should be in BE always */
>> +    r3 = cpu_to_be64(env->gpr[3]);
>> +    env->msr = msr;
>> +
>> +    spapr_init_v6hdr(&ext_elog->v6hdr);
>> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
>> +    ext_elog->mc.hdr.section_length =
>> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
>> +    ext_elog->mc.hdr.section_version = 1;
>> +
>> +    /* get rtas addr from fdt */
>> +    rtas_addr = spapr_get_rtas_addr();
>> +    if (!rtas_addr) {
>> +        /* Unable to fetch rtas_addr. Hence reset the guest */
>> +        ppc_cpu_do_system_reset(cs);
>> +    }
>> +
>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
>> +                              &log, sizeof(log));
>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
>> +                              sizeof(log), ext_elog,
>> +                              sizeof(struct mc_extended_log));
>> +
>> +    /* Save gpr[3] in the guest endian mode */
>> +    if ((*pcc->interrupts_big_endian)(cpu)) {
>> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
> 
> I don't think this is right.  AIUI env->gpr[] are all stored in *host*
> endianness (for ease of doing arithmetic).

env-gpr[3] is later used by guest to fetch the RTAS log. My guess is
that we will not do an endianness change of all the gprs during a switch
from host to guest (that will be costly). But let me cross check.

> 
>> +    } else {
>> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
>> +    }
>> +
>> +    env->nip = spapr->guest_machine_check_addr;
>> +}
>> +
>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>  {
>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>> @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>          }
>>      }
>>      spapr->mc_status = cpu->vcpu_id;
>> +
>> +    spapr_mce_dispatch_elog(cpu, recovered);
>> +
>> +    return;
>>  }
>>  
>>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index f7204d0..03f34bf 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>>  
>> +/* Offset from rtas-base where error log is placed */
>> +#define RTAS_ERRLOG_OFFSET       0x25
> 
> Is this offset PAPR defined, or chosen here?  Using an entirely
> unaliged (odd) address seems a very strange choice.

This is not PAPR defined. I will make it 0x30. Or do you prefer any
other offset?

Regards,
Aravinda

> 
>> +
>>  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
>>  {
>>      return addr & ~0xF000000000000000ULL;
>> @@ -798,6 +801,7 @@ int spapr_max_server_number(SpaprMachineState *spapr);
>>  void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
>>                        uint64_t pte0, uint64_t pte1);
>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
>> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
>>  
>>  /* DRC callbacks. */
>>  void spapr_core_release(DeviceState *dev);
>>
>
David Gibson May 10, 2019, 9:52 a.m. UTC | #5
On Fri, May 10, 2019 at 12:35:13PM +0530, Aravinda Prasad wrote:
> 
> 
> On Friday 10 May 2019 12:12 PM, David Gibson wrote:
> > On Mon, Apr 22, 2019 at 12:33:26PM +0530, Aravinda Prasad wrote:
> >> Upon a machine check exception (MCE) in a guest address space,
> >> KVM causes a guest exit to enable QEMU to build and pass the
> >> error to the guest in the PAPR defined rtas error log format.
> >>
> >> This patch builds the rtas error log, copies it to the rtas_addr
> >> and then invokes the guest registered machine check handler. The
> >> handler in the guest takes suitable action(s) depending on the type
> >> and criticality of the error. For example, if an error is
> >> unrecoverable memory corruption in an application inside the
> >> guest, then the guest kernel sends a SIGBUS to the application.
> >> For recoverable errors, the guest performs recovery actions and
> >> logs the error.
> >>
> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >> ---
> >>  hw/ppc/spapr.c         |    4 +
> >>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>  include/hw/ppc/spapr.h |    4 +
> >>  3 files changed, 253 insertions(+)
> >>
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index 2779efe..ffd1715 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
> >>          error_report("Could not get size of LPAR rtas '%s'", filename);
> >>          exit(1);
> >>      }
> >> +
> >> +    /* Resize blob to accommodate error log. */
> >> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> >> +
> >>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
> >>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
> >>          error_report("Could not load LPAR rtas '%s'", filename);
> >> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> >> index 9922a23..4032db0 100644
> >> --- a/hw/ppc/spapr_events.c
> >> +++ b/hw/ppc/spapr_events.c
> >> @@ -212,6 +212,106 @@ struct hp_extended_log {
> >>      struct rtas_event_log_v6_hp hp;
> >>  } QEMU_PACKED;
> >>  
> >> +struct rtas_event_log_v6_mc {
> >> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
> >> +    struct rtas_event_log_v6_section_header hdr;
> >> +    uint32_t fru_id;
> >> +    uint32_t proc_id;
> >> +    uint8_t error_type;
> >> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
> >> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
> >> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
> >> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
> >> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
> >> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
> >> +    uint8_t sub_err_type;
> >> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
> >> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
> >> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
> >> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
> >> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
> >> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
> >> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
> >> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
> >> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
> >> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
> >> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
> >> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
> >> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
> >> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
> >> +    uint8_t reserved_1[6];
> >> +    uint64_t effective_address;
> >> +    uint64_t logical_address;
> >> +} QEMU_PACKED;
> >> +
> >> +struct mc_extended_log {
> >> +    struct rtas_event_log_v6 v6hdr;
> >> +    struct rtas_event_log_v6_mc mc;
> >> +} QEMU_PACKED;
> >> +
> >> +struct MC_ierror_table {
> >> +    unsigned long srr1_mask;
> >> +    unsigned long srr1_value;
> >> +    bool nip_valid; /* nip is a valid indicator of faulting address */
> >> +    uint8_t error_type;
> >> +    uint8_t error_subtype;
> >> +    unsigned int initiator;
> >> +    unsigned int severity;
> >> +};
> >> +
> >> +static const struct MC_ierror_table mc_ierror_table[] = {
> >> +{ 0x00000000081c0000, 0x0000000000040000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000000081c0000, 0x0000000000080000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000000081c0000, 0x00000000000c0000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000000081c0000, 0x0000000000100000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000000081c0000, 0x0000000000140000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000000081c0000, 0x0000000000180000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0, 0, 0, 0, 0, 0 } };
> >> +
> >> +struct MC_derror_table {
> >> +    unsigned long dsisr_value;
> >> +    bool dar_valid; /* dar is a valid indicator of faulting address */
> >> +    uint8_t error_type;
> >> +    uint8_t error_subtype;
> >> +    unsigned int initiator;
> >> +    unsigned int severity;
> >> +};
> >> +
> >> +static const struct MC_derror_table mc_derror_table[] = {
> >> +{ 0x00008000, false,
> >> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00004000, true,
> >> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000800, true,
> >> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000400, true,
> >> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000080, true,
> >> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0x00000100, true,
> >> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> >> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> >> +{ 0, false, 0, 0, 0, 0 } };
> >> +
> >> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
> >> +
> >>  typedef enum EventClass {
> >>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
> >>      EVENT_CLASS_EPOW                = 1,
> >> @@ -620,6 +720,147 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
> >>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
> >>  }
> >>  
> >> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
> >> +{
> >> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
> >> +    return RTAS_ERROR_LOG_MAX;
> >> +}
> >> +
> >> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
> >> +                                        struct mc_extended_log *ext_elog)
> >> +{
> >> +    int i;
> >> +    CPUPPCState *env = &cpu->env;
> >> +    uint32_t summary;
> >> +    uint64_t dsisr = env->spr[SPR_DSISR];
> >> +
> >> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
> >> +    if (recovered) {
> >> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
> >> +    } else {
> >> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
> >> +    }
> >> +
> >> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
> >> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
> >> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
> >> +                continue;
> >> +            }
> >> +
> >> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
> >> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
> >> +            if (mc_derror_table[i].dar_valid) {
> >> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
> >> +            }
> >> +
> >> +            summary |= mc_derror_table[i].initiator
> >> +                        | mc_derror_table[i].severity;
> >> +
> >> +            return summary;
> >> +        }
> >> +    } else {
> >> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
> >> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
> >> +                    mc_ierror_table[i].srr1_value) {
> >> +                continue;
> >> +            }
> >> +
> >> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
> >> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
> >> +            if (mc_ierror_table[i].nip_valid) {
> >> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
> >> +            }
> >> +
> >> +            summary |= mc_ierror_table[i].initiator
> >> +                        | mc_ierror_table[i].severity;
> >> +
> >> +            return summary;
> >> +        }
> >> +    }
> >> +
> >> +    summary |= RTAS_LOG_INITIATOR_CPU;
> >> +    return summary;
> >> +}
> >> +
> >> +static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> >> +{
> >> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >> +    CPUState *cs = CPU(cpu);
> >> +    uint64_t rtas_addr;
> >> +    CPUPPCState *env = &cpu->env;
> >> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> >> +    target_ulong r3, msr = 0;
> >> +    struct rtas_error_log log;
> >> +    struct mc_extended_log *ext_elog;
> >> +    uint32_t summary;
> >> +
> >> +    /*
> >> +     * Properly set bits in MSR before we invoke the handler.
> >> +     * SRR0/1, DAR and DSISR are properly set by KVM
> >> +     */
> >> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> >> +        msr |= (1ULL << MSR_LE);
> >> +    }
> >> +
> >> +    if (env->msr && (1ULL << MSR_SF)) {
> >> +        msr |= (1ULL << MSR_SF);
> >> +    }
> >> +
> >> +    msr |= (1ULL << MSR_ME);
> >> +
> >> +    if (spapr->guest_machine_check_addr == -1) {
> >> +        /*
> >> +         * This implies that we have hit a machine check between system
> >> +         * reset and "ibm,nmi-register". Fall back to the old machine
> >> +         * check behavior in such cases.
> >> +         */
> >> +        env->spr[SPR_SRR0] = env->nip;
> >> +        env->spr[SPR_SRR1] = env->msr;
> >> +        env->msr = msr;
> >> +        env->nip = 0x200;
> >> +        return;
> >> +    }
> >> +
> >> +    ext_elog = g_malloc0(sizeof(struct mc_extended_log));
> >> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
> >> +
> >> +    log.summary = cpu_to_be32(summary);
> >> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
> >> +
> >> +    /* r3 should be in BE always */
> >> +    r3 = cpu_to_be64(env->gpr[3]);
> >> +    env->msr = msr;
> >> +
> >> +    spapr_init_v6hdr(&ext_elog->v6hdr);
> >> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
> >> +    ext_elog->mc.hdr.section_length =
> >> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
> >> +    ext_elog->mc.hdr.section_version = 1;
> >> +
> >> +    /* get rtas addr from fdt */
> >> +    rtas_addr = spapr_get_rtas_addr();
> >> +    if (!rtas_addr) {
> >> +        /* Unable to fetch rtas_addr. Hence reset the guest */
> >> +        ppc_cpu_do_system_reset(cs);
> >> +    }
> >> +
> >> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
> >> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
> >> +                              &log, sizeof(log));
> >> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
> >> +                              sizeof(log), ext_elog,
> >> +                              sizeof(struct mc_extended_log));
> >> +
> >> +    /* Save gpr[3] in the guest endian mode */
> >> +    if ((*pcc->interrupts_big_endian)(cpu)) {
> >> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
> > 
> > I don't think this is right.  AIUI env->gpr[] are all stored in *host*
> > endianness (for ease of doing arithmetic).
> 
> env-gpr[3] is later used by guest to fetch the RTAS log. My guess is
> that we will not do an endianness change of all the gprs during a switch
> from host to guest (that will be costly).

There's no need to "change endianness".  In TCG the host needs to do
arithmetic on the values and so they are in host endian.  With KVM the
env values are only synchronized when we enter/exit KVM and they're
going to registers, not memory and so have no endianness.

> But let me cross check.
> 
> > 
> >> +    } else {
> >> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
> >> +    }
> >> +
> >> +    env->nip = spapr->guest_machine_check_addr;
> >> +}
> >> +
> >>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >>  {
> >>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> >> @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >>          }
> >>      }
> >>      spapr->mc_status = cpu->vcpu_id;
> >> +
> >> +    spapr_mce_dispatch_elog(cpu, recovered);
> >> +
> >> +    return;
> >>  }
> >>  
> >>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index f7204d0..03f34bf 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
> >>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
> >>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
> >>  
> >> +/* Offset from rtas-base where error log is placed */
> >> +#define RTAS_ERRLOG_OFFSET       0x25
> > 
> > Is this offset PAPR defined, or chosen here?  Using an entirely
> > unaliged (odd) address seems a very strange choice.
> 
> This is not PAPR defined. I will make it 0x30. Or do you prefer any
> other offset?

0x30 should be fine.
Aravinda Prasad May 13, 2019, 5 a.m. UTC | #6
On Friday 10 May 2019 03:22 PM, David Gibson wrote:
> On Fri, May 10, 2019 at 12:35:13PM +0530, Aravinda Prasad wrote:
>>
>>
>> On Friday 10 May 2019 12:12 PM, David Gibson wrote:
>>> On Mon, Apr 22, 2019 at 12:33:26PM +0530, Aravinda Prasad wrote:

[...]

>>>> +    /* Save gpr[3] in the guest endian mode */
>>>> +    if ((*pcc->interrupts_big_endian)(cpu)) {
>>>> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
>>>
>>> I don't think this is right.  AIUI env->gpr[] are all stored in *host*
>>> endianness (for ease of doing arithmetic).
>>
>> env-gpr[3] is later used by guest to fetch the RTAS log. My guess is
>> that we will not do an endianness change of all the gprs during a switch
>> from host to guest (that will be costly).
> 
> There's no need to "change endianness".  In TCG the host needs to do
> arithmetic on the values and so they are in host endian.  With KVM the
> env values are only synchronized when we enter/exit KVM and they're
> going to registers, not memory and so have no endianness.

Ah.. ok.

> 
>> But let me cross check.
>>
>>>
>>>> +    } else {
>>>> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
>>>> +    }
>>>> +
>>>> +    env->nip = spapr->guest_machine_check_addr;
>>>> +}
>>>> +
>>>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>>>  {
>>>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>>> @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>>>          }
>>>>      }
>>>>      spapr->mc_status = cpu->vcpu_id;
>>>> +
>>>> +    spapr_mce_dispatch_elog(cpu, recovered);
>>>> +
>>>> +    return;
>>>>  }
>>>>  
>>>>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index f7204d0..03f34bf 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>>>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>>>>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>>>>  
>>>> +/* Offset from rtas-base where error log is placed */
>>>> +#define RTAS_ERRLOG_OFFSET       0x25
>>>
>>> Is this offset PAPR defined, or chosen here?  Using an entirely
>>> unaliged (odd) address seems a very strange choice.
>>
>> This is not PAPR defined. I will make it 0x30. Or do you prefer any
>> other offset?
> 
> 0x30 should be fine.

ok..

>
Greg Kurz May 13, 2019, 11:30 a.m. UTC | #7
On Mon, 22 Apr 2019 12:33:26 +0530
Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:

> Upon a machine check exception (MCE) in a guest address space,
> KVM causes a guest exit to enable QEMU to build and pass the
> error to the guest in the PAPR defined rtas error log format.
> 
> This patch builds the rtas error log, copies it to the rtas_addr
> and then invokes the guest registered machine check handler. The
> handler in the guest takes suitable action(s) depending on the type
> and criticality of the error. For example, if an error is
> unrecoverable memory corruption in an application inside the
> guest, then the guest kernel sends a SIGBUS to the application.
> For recoverable errors, the guest performs recovery actions and
> logs the error.
> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         |    4 +
>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |    4 +
>  3 files changed, 253 insertions(+)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 2779efe..ffd1715 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
>          error_report("Could not get size of LPAR rtas '%s'", filename);
>          exit(1);
>      }
> +
> +    /* Resize blob to accommodate error log. */
> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> +

This is the only user for spapr_get_rtas_size(), which is trivial.
I suggest you simply open-code it here.

But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
DT. Since existing machine types don't do that, I guess we should only use
the new size if cap-fwnmi-mce=on for the sake of compatibility.

>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>          error_report("Could not load LPAR rtas '%s'", filename);
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 9922a23..4032db0 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -212,6 +212,106 @@ struct hp_extended_log {
>      struct rtas_event_log_v6_hp hp;
>  } QEMU_PACKED;
>  
> +struct rtas_event_log_v6_mc {

Even if the rest of the code in this file seems to ignore CODING_STYLE,
maybe it's time to start using CamelCase.

David ?

> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
> +    struct rtas_event_log_v6_section_header hdr;
> +    uint32_t fru_id;
> +    uint32_t proc_id;
> +    uint8_t error_type;
> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
> +    uint8_t sub_err_type;
> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
> +    uint8_t reserved_1[6];
> +    uint64_t effective_address;
> +    uint64_t logical_address;
> +} QEMU_PACKED;
> +
> +struct mc_extended_log {
> +    struct rtas_event_log_v6 v6hdr;
> +    struct rtas_event_log_v6_mc mc;
> +} QEMU_PACKED;
> +
> +struct MC_ierror_table {
> +    unsigned long srr1_mask;
> +    unsigned long srr1_value;
> +    bool nip_valid; /* nip is a valid indicator of faulting address */
> +    uint8_t error_type;
> +    uint8_t error_subtype;
> +    unsigned int initiator;
> +    unsigned int severity;
> +};
> +
> +static const struct MC_ierror_table mc_ierror_table[] = {
> +{ 0x00000000081c0000, 0x0000000000040000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000080000, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x00000000000c0000, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000100000, true,
> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000140000, true,
> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000000081c0000, 0x0000000000180000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0, 0, 0, 0, 0, 0 } };
> +
> +struct MC_derror_table {
> +    unsigned long dsisr_value;
> +    bool dar_valid; /* dar is a valid indicator of faulting address */
> +    uint8_t error_type;
> +    uint8_t error_subtype;
> +    unsigned int initiator;
> +    unsigned int severity;
> +};
> +
> +static const struct MC_derror_table mc_derror_table[] = {
> +{ 0x00008000, false,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00004000, true,
> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000800, true,
> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000400, true,
> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000080, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0x00000100, true,
> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> +{ 0, false, 0, 0, 0, 0 } };
> +
> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
> +
>  typedef enum EventClass {
>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
>      EVENT_CLASS_EPOW                = 1,
> @@ -620,6 +720,147 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
>  }
>  
> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
> +{
> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
> +    return RTAS_ERROR_LOG_MAX;
> +}
> +
> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
> +                                        struct mc_extended_log *ext_elog)
> +{
> +    int i;
> +    CPUPPCState *env = &cpu->env;
> +    uint32_t summary;
> +    uint64_t dsisr = env->spr[SPR_DSISR];
> +
> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
> +    if (recovered) {
> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
> +    } else {
> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
> +    }
> +
> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
> +                continue;
> +            }
> +
> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
> +            if (mc_derror_table[i].dar_valid) {
> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
> +            }
> +
> +            summary |= mc_derror_table[i].initiator
> +                        | mc_derror_table[i].severity;
> +
> +            return summary;
> +        }
> +    } else {
> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
> +                    mc_ierror_table[i].srr1_value) {
> +                continue;
> +            }
> +
> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
> +            if (mc_ierror_table[i].nip_valid) {
> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
> +            }
> +
> +            summary |= mc_ierror_table[i].initiator
> +                        | mc_ierror_table[i].severity;
> +
> +            return summary;
> +        }
> +    }
> +
> +    summary |= RTAS_LOG_INITIATOR_CPU;
> +    return summary;
> +}
> +
> +static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> +{
> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> +    CPUState *cs = CPU(cpu);
> +    uint64_t rtas_addr;
> +    CPUPPCState *env = &cpu->env;
> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +    target_ulong r3, msr = 0;
> +    struct rtas_error_log log;
> +    struct mc_extended_log *ext_elog;
> +    uint32_t summary;
> +
> +    /*
> +     * Properly set bits in MSR before we invoke the handler.
> +     * SRR0/1, DAR and DSISR are properly set by KVM
> +     */
> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> +        msr |= (1ULL << MSR_LE);
> +    }
> +
> +    if (env->msr && (1ULL << MSR_SF)) {
> +        msr |= (1ULL << MSR_SF);
> +    }
> +
> +    msr |= (1ULL << MSR_ME);
> +
> +    if (spapr->guest_machine_check_addr == -1) {

Should be (target_ulong) -1

> +        /*
> +         * This implies that we have hit a machine check between system
> +         * reset and "ibm,nmi-register". Fall back to the old machine
> +         * check behavior in such cases.
> +         */
> +        env->spr[SPR_SRR0] = env->nip;
> +        env->spr[SPR_SRR1] = env->msr;
> +        env->msr = msr;
> +        env->nip = 0x200;
> +        return;
> +    }
> +
> +    ext_elog = g_malloc0(sizeof(struct mc_extended_log));

sizeof(*ext_elog) is preferable IMHO, same remark for the other sizeof sites.

Also, I can't find the corresponding call to g_free(), which should be
somewhere in this function IIUC.

> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
> +
> +    log.summary = cpu_to_be32(summary);
> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
> +
> +    /* r3 should be in BE always */
> +    r3 = cpu_to_be64(env->gpr[3]);
> +    env->msr = msr;
> +
> +    spapr_init_v6hdr(&ext_elog->v6hdr);
> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
> +    ext_elog->mc.hdr.section_length =
> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
> +    ext_elog->mc.hdr.section_version = 1;
> +
> +    /* get rtas addr from fdt */
> +    rtas_addr = spapr_get_rtas_addr();
> +    if (!rtas_addr) {
> +        /* Unable to fetch rtas_addr. Hence reset the guest */
> +        ppc_cpu_do_system_reset(cs);
> +    }
> +
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
> +                              &log, sizeof(log));
> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
> +                              sizeof(log), ext_elog,
> +                              sizeof(struct mc_extended_log));
> +
> +    /* Save gpr[3] in the guest endian mode */
> +    if ((*pcc->interrupts_big_endian)(cpu)) {
> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
> +    } else {
> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
> +    }
> +
> +    env->nip = spapr->guest_machine_check_addr;
> +}
> +
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>          }
>      }
>      spapr->mc_status = cpu->vcpu_id;
> +
> +    spapr_mce_dispatch_elog(cpu, recovered);
> +
> +    return;
>  }
>  
>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index f7204d0..03f34bf 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>  
> +/* Offset from rtas-base where error log is placed */
> +#define RTAS_ERRLOG_OFFSET       0x25
> +

We already have an RTAS_ERROR_LOG_MAX macro defined in this file.
Maybe use the same "ERROR_LOG" wording for consistency.

>  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
>  {
>      return addr & ~0xF000000000000000ULL;
> @@ -798,6 +801,7 @@ int spapr_max_server_number(SpaprMachineState *spapr);
>  void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
>                        uint64_t pte0, uint64_t pte1);
>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
>  
>  /* DRC callbacks. */
>  void spapr_core_release(DeviceState *dev);
> 
>
David Gibson May 14, 2019, 12:08 a.m. UTC | #8
On Mon, May 13, 2019 at 01:30:53PM +0200, Greg Kurz wrote:
> On Mon, 22 Apr 2019 12:33:26 +0530
> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> 
> > Upon a machine check exception (MCE) in a guest address space,
> > KVM causes a guest exit to enable QEMU to build and pass the
> > error to the guest in the PAPR defined rtas error log format.
> > 
> > This patch builds the rtas error log, copies it to the rtas_addr
> > and then invokes the guest registered machine check handler. The
> > handler in the guest takes suitable action(s) depending on the type
> > and criticality of the error. For example, if an error is
> > unrecoverable memory corruption in an application inside the
> > guest, then the guest kernel sends a SIGBUS to the application.
> > For recoverable errors, the guest performs recovery actions and
> > logs the error.
> > 
> > Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> > ---
> >  hw/ppc/spapr.c         |    4 +
> >  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/hw/ppc/spapr.h |    4 +
> >  3 files changed, 253 insertions(+)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 2779efe..ffd1715 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
> >          error_report("Could not get size of LPAR rtas '%s'", filename);
> >          exit(1);
> >      }
> > +
> > +    /* Resize blob to accommodate error log. */
> > +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> > +
> 
> This is the only user for spapr_get_rtas_size(), which is trivial.
> I suggest you simply open-code it here.

I agree.

> But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
> DT. Since existing machine types don't do that, I guess we should only use
> the new size if cap-fwnmi-mce=on for the sake of compatibility.

Yes, that's a good idea.  Changing this is very unlikely to break a
guest, but it's easy to be safe here so let's do it.

> 
> >      spapr->rtas_blob = g_malloc(spapr->rtas_size);
> >      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
> >          error_report("Could not load LPAR rtas '%s'", filename);
> > diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> > index 9922a23..4032db0 100644
> > --- a/hw/ppc/spapr_events.c
> > +++ b/hw/ppc/spapr_events.c
> > @@ -212,6 +212,106 @@ struct hp_extended_log {
> >      struct rtas_event_log_v6_hp hp;
> >  } QEMU_PACKED;
> >  
> > +struct rtas_event_log_v6_mc {
> 
> Even if the rest of the code in this file seems to ignore CODING_STYLE,
> maybe it's time to start using CamelCase.
> 
> David ?

Out of scope here, I think.

> > +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
> > +    struct rtas_event_log_v6_section_header hdr;
> > +    uint32_t fru_id;
> > +    uint32_t proc_id;
> > +    uint8_t error_type;
> > +#define RTAS_LOG_V6_MC_TYPE_UE                           0
> > +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
> > +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
> > +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
> > +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
> > +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
> > +    uint8_t sub_err_type;
> > +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
> > +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
> > +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
> > +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
> > +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
> > +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
> > +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
> > +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
> > +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
> > +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
> > +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
> > +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
> > +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
> > +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
> > +    uint8_t reserved_1[6];
> > +    uint64_t effective_address;
> > +    uint64_t logical_address;
> > +} QEMU_PACKED;
> > +
> > +struct mc_extended_log {
> > +    struct rtas_event_log_v6 v6hdr;
> > +    struct rtas_event_log_v6_mc mc;
> > +} QEMU_PACKED;
> > +
> > +struct MC_ierror_table {
> > +    unsigned long srr1_mask;
> > +    unsigned long srr1_value;
> > +    bool nip_valid; /* nip is a valid indicator of faulting address */
> > +    uint8_t error_type;
> > +    uint8_t error_subtype;
> > +    unsigned int initiator;
> > +    unsigned int severity;
> > +};
> > +
> > +static const struct MC_ierror_table mc_ierror_table[] = {
> > +{ 0x00000000081c0000, 0x0000000000040000, true,
> > +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000000081c0000, 0x0000000000080000, true,
> > +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000000081c0000, 0x00000000000c0000, true,
> > +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000000081c0000, 0x0000000000100000, true,
> > +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000000081c0000, 0x0000000000140000, true,
> > +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000000081c0000, 0x0000000000180000, true,
> > +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0, 0, 0, 0, 0, 0 } };
> > +
> > +struct MC_derror_table {
> > +    unsigned long dsisr_value;
> > +    bool dar_valid; /* dar is a valid indicator of faulting address */
> > +    uint8_t error_type;
> > +    uint8_t error_subtype;
> > +    unsigned int initiator;
> > +    unsigned int severity;
> > +};
> > +
> > +static const struct MC_derror_table mc_derror_table[] = {
> > +{ 0x00008000, false,
> > +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00004000, true,
> > +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000800, true,
> > +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000400, true,
> > +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000080, true,
> > +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0x00000100, true,
> > +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
> > +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
> > +{ 0, false, 0, 0, 0, 0 } };
> > +
> > +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
> > +
> >  typedef enum EventClass {
> >      EVENT_CLASS_INTERNAL_ERRORS     = 0,
> >      EVENT_CLASS_EPOW                = 1,
> > @@ -620,6 +720,147 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
> >                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
> >  }
> >  
> > +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
> > +{
> > +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
> > +    return RTAS_ERROR_LOG_MAX;
> > +}
> > +
> > +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
> > +                                        struct mc_extended_log *ext_elog)
> > +{
> > +    int i;
> > +    CPUPPCState *env = &cpu->env;
> > +    uint32_t summary;
> > +    uint64_t dsisr = env->spr[SPR_DSISR];
> > +
> > +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
> > +    if (recovered) {
> > +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
> > +    } else {
> > +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
> > +    }
> > +
> > +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
> > +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
> > +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
> > +                continue;
> > +            }
> > +
> > +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
> > +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
> > +            if (mc_derror_table[i].dar_valid) {
> > +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
> > +            }
> > +
> > +            summary |= mc_derror_table[i].initiator
> > +                        | mc_derror_table[i].severity;
> > +
> > +            return summary;
> > +        }
> > +    } else {
> > +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
> > +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
> > +                    mc_ierror_table[i].srr1_value) {
> > +                continue;
> > +            }
> > +
> > +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
> > +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
> > +            if (mc_ierror_table[i].nip_valid) {
> > +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
> > +            }
> > +
> > +            summary |= mc_ierror_table[i].initiator
> > +                        | mc_ierror_table[i].severity;
> > +
> > +            return summary;
> > +        }
> > +    }
> > +
> > +    summary |= RTAS_LOG_INITIATOR_CPU;
> > +    return summary;
> > +}
> > +
> > +static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
> > +{
> > +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > +    CPUState *cs = CPU(cpu);
> > +    uint64_t rtas_addr;
> > +    CPUPPCState *env = &cpu->env;
> > +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> > +    target_ulong r3, msr = 0;
> > +    struct rtas_error_log log;
> > +    struct mc_extended_log *ext_elog;
> > +    uint32_t summary;
> > +
> > +    /*
> > +     * Properly set bits in MSR before we invoke the handler.
> > +     * SRR0/1, DAR and DSISR are properly set by KVM
> > +     */
> > +    if (!(*pcc->interrupts_big_endian)(cpu)) {
> > +        msr |= (1ULL << MSR_LE);
> > +    }
> > +
> > +    if (env->msr && (1ULL << MSR_SF)) {
> > +        msr |= (1ULL << MSR_SF);
> > +    }
> > +
> > +    msr |= (1ULL << MSR_ME);
> > +
> > +    if (spapr->guest_machine_check_addr == -1) {
> 
> Should be (target_ulong) -1

I think the == itself should perform the necessary coercion.

> > +        /*
> > +         * This implies that we have hit a machine check between system
> > +         * reset and "ibm,nmi-register". Fall back to the old machine
> > +         * check behavior in such cases.
> > +         */
> > +        env->spr[SPR_SRR0] = env->nip;
> > +        env->spr[SPR_SRR1] = env->msr;
> > +        env->msr = msr;
> > +        env->nip = 0x200;
> > +        return;
> > +    }
> > +
> > +    ext_elog = g_malloc0(sizeof(struct mc_extended_log));
> 
> sizeof(*ext_elog) is preferable IMHO, same remark for the other sizeof sites.

Agreed.

> Also, I can't find the corresponding call to g_free(), which should be
> somewhere in this function IIUC.

Yes, that needs fixing.

> 
> > +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
> > +
> > +    log.summary = cpu_to_be32(summary);
> > +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
> > +
> > +    /* r3 should be in BE always */
> > +    r3 = cpu_to_be64(env->gpr[3]);
> > +    env->msr = msr;
> > +
> > +    spapr_init_v6hdr(&ext_elog->v6hdr);
> > +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
> > +    ext_elog->mc.hdr.section_length =
> > +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
> > +    ext_elog->mc.hdr.section_version = 1;
> > +
> > +    /* get rtas addr from fdt */
> > +    rtas_addr = spapr_get_rtas_addr();
> > +    if (!rtas_addr) {
> > +        /* Unable to fetch rtas_addr. Hence reset the guest */
> > +        ppc_cpu_do_system_reset(cs);
> > +    }
> > +
> > +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
> > +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
> > +                              &log, sizeof(log));
> > +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
> > +                              sizeof(log), ext_elog,
> > +                              sizeof(struct mc_extended_log));
> > +
> > +    /* Save gpr[3] in the guest endian mode */
> > +    if ((*pcc->interrupts_big_endian)(cpu)) {
> > +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
> > +    } else {
> > +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
> > +    }
> > +
> > +    env->nip = spapr->guest_machine_check_addr;
> > +}
> > +
> >  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >  {
> >      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
> > @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
> >          }
> >      }
> >      spapr->mc_status = cpu->vcpu_id;
> > +
> > +    spapr_mce_dispatch_elog(cpu, recovered);
> > +
> > +    return;
> >  }
> >  
> >  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
> > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > index f7204d0..03f34bf 100644
> > --- a/include/hw/ppc/spapr.h
> > +++ b/include/hw/ppc/spapr.h
> > @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
> >  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
> >  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
> >  
> > +/* Offset from rtas-base where error log is placed */
> > +#define RTAS_ERRLOG_OFFSET       0x25
> > +
> 
> We already have an RTAS_ERROR_LOG_MAX macro defined in this file.
> Maybe use the same "ERROR_LOG" wording for consistency.

Agreed.

> >  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
> >  {
> >      return addr & ~0xF000000000000000ULL;
> > @@ -798,6 +801,7 @@ int spapr_max_server_number(SpaprMachineState *spapr);
> >  void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
> >                        uint64_t pte0, uint64_t pte1);
> >  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
> > +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
> >  
> >  /* DRC callbacks. */
> >  void spapr_core_release(DeviceState *dev);
> > 
> > 
>
Aravinda Prasad May 14, 2019, 4:26 a.m. UTC | #9
On Tuesday 14 May 2019 05:38 AM, David Gibson wrote:
> On Mon, May 13, 2019 at 01:30:53PM +0200, Greg Kurz wrote:
>> On Mon, 22 Apr 2019 12:33:26 +0530
>> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
>>
>>> Upon a machine check exception (MCE) in a guest address space,
>>> KVM causes a guest exit to enable QEMU to build and pass the
>>> error to the guest in the PAPR defined rtas error log format.
>>>
>>> This patch builds the rtas error log, copies it to the rtas_addr
>>> and then invokes the guest registered machine check handler. The
>>> handler in the guest takes suitable action(s) depending on the type
>>> and criticality of the error. For example, if an error is
>>> unrecoverable memory corruption in an application inside the
>>> guest, then the guest kernel sends a SIGBUS to the application.
>>> For recoverable errors, the guest performs recovery actions and
>>> logs the error.
>>>
>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>>> ---
>>>  hw/ppc/spapr.c         |    4 +
>>>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>  include/hw/ppc/spapr.h |    4 +
>>>  3 files changed, 253 insertions(+)
>>>
>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>> index 2779efe..ffd1715 100644
>>> --- a/hw/ppc/spapr.c
>>> +++ b/hw/ppc/spapr.c
>>> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
>>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>>          exit(1);
>>>      }
>>> +
>>> +    /* Resize blob to accommodate error log. */
>>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
>>> +
>>
>> This is the only user for spapr_get_rtas_size(), which is trivial.
>> I suggest you simply open-code it here.
> 
> I agree.

Sure.

> 
>> But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
>> DT. Since existing machine types don't do that, I guess we should only use
>> the new size if cap-fwnmi-mce=on for the sake of compatibility.
> 
> Yes, that's a good idea.  Changing this is very unlikely to break a
> guest, but it's easy to be safe here so let's do it.

I did it like that because the rtas_blob is allocated based on rtas_size
in spapr_machine_init(). During spapr_machine_init() it is not know if
the guest calls "ibm, nmi-register". So if we want to use the new size
only when cap_fwnmi=on, then we have to realloc the blob in "ibm,
nmi-register".


> 
>>
>>>      spapr->rtas_blob = g_malloc(spapr->rtas_size);
>>>      if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
>>>          error_report("Could not load LPAR rtas '%s'", filename);
>>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>>> index 9922a23..4032db0 100644
>>> --- a/hw/ppc/spapr_events.c
>>> +++ b/hw/ppc/spapr_events.c
>>> @@ -212,6 +212,106 @@ struct hp_extended_log {
>>>      struct rtas_event_log_v6_hp hp;
>>>  } QEMU_PACKED;
>>>  
>>> +struct rtas_event_log_v6_mc {
>>
>> Even if the rest of the code in this file seems to ignore CODING_STYLE,
>> maybe it's time to start using CamelCase.
>>
>> David ?
> 
> Out of scope here, I think.
> 
>>> +#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
>>> +    struct rtas_event_log_v6_section_header hdr;
>>> +    uint32_t fru_id;
>>> +    uint32_t proc_id;
>>> +    uint8_t error_type;
>>> +#define RTAS_LOG_V6_MC_TYPE_UE                           0
>>> +#define RTAS_LOG_V6_MC_TYPE_SLB                          1
>>> +#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
>>> +#define RTAS_LOG_V6_MC_TYPE_TLB                          4
>>> +#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
>>> +#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
>>> +    uint8_t sub_err_type;
>>> +#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
>>> +#define RTAS_LOG_V6_MC_UE_IFETCH                         1
>>> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
>>> +#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
>>> +#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
>>> +#define RTAS_LOG_V6_MC_SLB_PARITY                        0
>>> +#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
>>> +#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
>>> +#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
>>> +#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
>>> +#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
>>> +#define RTAS_LOG_V6_MC_TLB_PARITY                        1
>>> +#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
>>> +#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
>>> +    uint8_t reserved_1[6];
>>> +    uint64_t effective_address;
>>> +    uint64_t logical_address;
>>> +} QEMU_PACKED;
>>> +
>>> +struct mc_extended_log {
>>> +    struct rtas_event_log_v6 v6hdr;
>>> +    struct rtas_event_log_v6_mc mc;
>>> +} QEMU_PACKED;
>>> +
>>> +struct MC_ierror_table {
>>> +    unsigned long srr1_mask;
>>> +    unsigned long srr1_value;
>>> +    bool nip_valid; /* nip is a valid indicator of faulting address */
>>> +    uint8_t error_type;
>>> +    uint8_t error_subtype;
>>> +    unsigned int initiator;
>>> +    unsigned int severity;
>>> +};
>>> +
>>> +static const struct MC_ierror_table mc_ierror_table[] = {
>>> +{ 0x00000000081c0000, 0x0000000000040000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000000081c0000, 0x0000000000080000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000000081c0000, 0x00000000000c0000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000000081c0000, 0x0000000000100000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000000081c0000, 0x0000000000140000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000000081c0000, 0x0000000000180000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0, 0, 0, 0, 0, 0 } };
>>> +
>>> +struct MC_derror_table {
>>> +    unsigned long dsisr_value;
>>> +    bool dar_valid; /* dar is a valid indicator of faulting address */
>>> +    uint8_t error_type;
>>> +    uint8_t error_subtype;
>>> +    unsigned int initiator;
>>> +    unsigned int severity;
>>> +};
>>> +
>>> +static const struct MC_derror_table mc_derror_table[] = {
>>> +{ 0x00008000, false,
>>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00004000, true,
>>> +  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000800, true,
>>> +  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000400, true,
>>> +  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000080, true,
>>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0x00000100, true,
>>> +  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
>>> +  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
>>> +{ 0, false, 0, 0, 0, 0 } };
>>> +
>>> +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
>>> +
>>>  typedef enum EventClass {
>>>      EVENT_CLASS_INTERNAL_ERRORS     = 0,
>>>      EVENT_CLASS_EPOW                = 1,
>>> @@ -620,6 +720,147 @@ void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
>>>                              RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
>>>  }
>>>  
>>> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
>>> +{
>>> +    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
>>> +    return RTAS_ERROR_LOG_MAX;
>>> +}
>>> +
>>> +static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
>>> +                                        struct mc_extended_log *ext_elog)
>>> +{
>>> +    int i;
>>> +    CPUPPCState *env = &cpu->env;
>>> +    uint32_t summary;
>>> +    uint64_t dsisr = env->spr[SPR_DSISR];
>>> +
>>> +    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
>>> +    if (recovered) {
>>> +        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
>>> +    } else {
>>> +        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
>>> +    }
>>> +
>>> +    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
>>> +        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
>>> +            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
>>> +                continue;
>>> +            }
>>> +
>>> +            ext_elog->mc.error_type = mc_derror_table[i].error_type;
>>> +            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
>>> +            if (mc_derror_table[i].dar_valid) {
>>> +                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
>>> +            }
>>> +
>>> +            summary |= mc_derror_table[i].initiator
>>> +                        | mc_derror_table[i].severity;
>>> +
>>> +            return summary;
>>> +        }
>>> +    } else {
>>> +        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
>>> +            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
>>> +                    mc_ierror_table[i].srr1_value) {
>>> +                continue;
>>> +            }
>>> +
>>> +            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
>>> +            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
>>> +            if (mc_ierror_table[i].nip_valid) {
>>> +                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
>>> +            }
>>> +
>>> +            summary |= mc_ierror_table[i].initiator
>>> +                        | mc_ierror_table[i].severity;
>>> +
>>> +            return summary;
>>> +        }
>>> +    }
>>> +
>>> +    summary |= RTAS_LOG_INITIATOR_CPU;
>>> +    return summary;
>>> +}
>>> +
>>> +static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
>>> +{
>>> +    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>> +    CPUState *cs = CPU(cpu);
>>> +    uint64_t rtas_addr;
>>> +    CPUPPCState *env = &cpu->env;
>>> +    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
>>> +    target_ulong r3, msr = 0;
>>> +    struct rtas_error_log log;
>>> +    struct mc_extended_log *ext_elog;
>>> +    uint32_t summary;
>>> +
>>> +    /*
>>> +     * Properly set bits in MSR before we invoke the handler.
>>> +     * SRR0/1, DAR and DSISR are properly set by KVM
>>> +     */
>>> +    if (!(*pcc->interrupts_big_endian)(cpu)) {
>>> +        msr |= (1ULL << MSR_LE);
>>> +    }
>>> +
>>> +    if (env->msr && (1ULL << MSR_SF)) {
>>> +        msr |= (1ULL << MSR_SF);
>>> +    }
>>> +
>>> +    msr |= (1ULL << MSR_ME);
>>> +
>>> +    if (spapr->guest_machine_check_addr == -1) {
>>
>> Should be (target_ulong) -1
> 
> I think the == itself should perform the necessary coercion.
> 
>>> +        /*
>>> +         * This implies that we have hit a machine check between system
>>> +         * reset and "ibm,nmi-register". Fall back to the old machine
>>> +         * check behavior in such cases.
>>> +         */
>>> +        env->spr[SPR_SRR0] = env->nip;
>>> +        env->spr[SPR_SRR1] = env->msr;
>>> +        env->msr = msr;
>>> +        env->nip = 0x200;
>>> +        return;
>>> +    }
>>> +
>>> +    ext_elog = g_malloc0(sizeof(struct mc_extended_log));
>>
>> sizeof(*ext_elog) is preferable IMHO, same remark for the other sizeof sites.
> 
> Agreed.

ok.

> 
>> Also, I can't find the corresponding call to g_free(), which should be
>> somewhere in this function IIUC.
> 
> Yes, that needs fixing.

Yes, missed calling free. Will fix it.

> 
>>
>>> +    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
>>> +
>>> +    log.summary = cpu_to_be32(summary);
>>> +    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
>>> +
>>> +    /* r3 should be in BE always */
>>> +    r3 = cpu_to_be64(env->gpr[3]);
>>> +    env->msr = msr;
>>> +
>>> +    spapr_init_v6hdr(&ext_elog->v6hdr);
>>> +    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
>>> +    ext_elog->mc.hdr.section_length =
>>> +                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
>>> +    ext_elog->mc.hdr.section_version = 1;
>>> +
>>> +    /* get rtas addr from fdt */
>>> +    rtas_addr = spapr_get_rtas_addr();
>>> +    if (!rtas_addr) {
>>> +        /* Unable to fetch rtas_addr. Hence reset the guest */
>>> +        ppc_cpu_do_system_reset(cs);
>>> +    }
>>> +
>>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
>>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
>>> +                              &log, sizeof(log));
>>> +    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
>>> +                              sizeof(log), ext_elog,
>>> +                              sizeof(struct mc_extended_log));
>>> +
>>> +    /* Save gpr[3] in the guest endian mode */
>>> +    if ((*pcc->interrupts_big_endian)(cpu)) {
>>> +        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
>>> +    } else {
>>> +        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
>>> +    }
>>> +
>>> +    env->nip = spapr->guest_machine_check_addr;
>>> +}
>>> +
>>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>>  {
>>>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>> @@ -640,6 +881,10 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>>          }
>>>      }
>>>      spapr->mc_status = cpu->vcpu_id;
>>> +
>>> +    spapr_mce_dispatch_elog(cpu, recovered);
>>> +
>>> +    return;
>>>  }
>>>  
>>>  static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>> index f7204d0..03f34bf 100644
>>> --- a/include/hw/ppc/spapr.h
>>> +++ b/include/hw/ppc/spapr.h
>>> @@ -661,6 +661,9 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>>  #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
>>>  #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
>>>  
>>> +/* Offset from rtas-base where error log is placed */
>>> +#define RTAS_ERRLOG_OFFSET       0x25
>>> +
>>
>> We already have an RTAS_ERROR_LOG_MAX macro defined in this file.
>> Maybe use the same "ERROR_LOG" wording for consistency.
> 
> Agreed.

ok.

Regards,
Aravinda

> 
>>>  static inline uint64_t ppc64_phys_to_real(uint64_t addr)
>>>  {
>>>      return addr & ~0xF000000000000000ULL;
>>> @@ -798,6 +801,7 @@ int spapr_max_server_number(SpaprMachineState *spapr);
>>>  void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
>>>                        uint64_t pte0, uint64_t pte1);
>>>  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
>>> +ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
>>>  
>>>  /* DRC callbacks. */
>>>  void spapr_core_release(DeviceState *dev);
>>>
>>>
>>
>
David Gibson May 14, 2019, 4:40 a.m. UTC | #10
On Tue, May 14, 2019 at 09:56:41AM +0530, Aravinda Prasad wrote:
> 
> 
> On Tuesday 14 May 2019 05:38 AM, David Gibson wrote:
> > On Mon, May 13, 2019 at 01:30:53PM +0200, Greg Kurz wrote:
> >> On Mon, 22 Apr 2019 12:33:26 +0530
> >> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> >>
> >>> Upon a machine check exception (MCE) in a guest address space,
> >>> KVM causes a guest exit to enable QEMU to build and pass the
> >>> error to the guest in the PAPR defined rtas error log format.
> >>>
> >>> This patch builds the rtas error log, copies it to the rtas_addr
> >>> and then invokes the guest registered machine check handler. The
> >>> handler in the guest takes suitable action(s) depending on the type
> >>> and criticality of the error. For example, if an error is
> >>> unrecoverable memory corruption in an application inside the
> >>> guest, then the guest kernel sends a SIGBUS to the application.
> >>> For recoverable errors, the guest performs recovery actions and
> >>> logs the error.
> >>>
> >>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >>> ---
> >>>  hw/ppc/spapr.c         |    4 +
> >>>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>  include/hw/ppc/spapr.h |    4 +
> >>>  3 files changed, 253 insertions(+)
> >>>
> >>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>> index 2779efe..ffd1715 100644
> >>> --- a/hw/ppc/spapr.c
> >>> +++ b/hw/ppc/spapr.c
> >>> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
> >>>          error_report("Could not get size of LPAR rtas '%s'", filename);
> >>>          exit(1);
> >>>      }
> >>> +
> >>> +    /* Resize blob to accommodate error log. */
> >>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> >>> +
> >>
> >> This is the only user for spapr_get_rtas_size(), which is trivial.
> >> I suggest you simply open-code it here.
> > 
> > I agree.
> 
> Sure.
> 
> > 
> >> But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
> >> DT. Since existing machine types don't do that, I guess we should only use
> >> the new size if cap-fwnmi-mce=on for the sake of compatibility.
> > 
> > Yes, that's a good idea.  Changing this is very unlikely to break a
> > guest, but it's easy to be safe here so let's do it.
> 
> I did it like that because the rtas_blob is allocated based on rtas_size
> in spapr_machine_init(). During spapr_machine_init() it is not know if
> the guest calls "ibm, nmi-register". So if we want to use the new size
> only when cap_fwnmi=on, then we have to realloc the blob in "ibm,
> nmi-register".

What?  Just always allocate the necessary space in
spapr_machine_init() if cap_fwnmi=on, it'll be wasted if
ibm,nmi-register is never called, but it's not that much space so we
don't really care.
Aravinda Prasad May 14, 2019, 5:06 a.m. UTC | #11
On Tuesday 14 May 2019 10:10 AM, David Gibson wrote:
> On Tue, May 14, 2019 at 09:56:41AM +0530, Aravinda Prasad wrote:
>>
>>
>> On Tuesday 14 May 2019 05:38 AM, David Gibson wrote:
>>> On Mon, May 13, 2019 at 01:30:53PM +0200, Greg Kurz wrote:
>>>> On Mon, 22 Apr 2019 12:33:26 +0530
>>>> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
>>>>
>>>>> Upon a machine check exception (MCE) in a guest address space,
>>>>> KVM causes a guest exit to enable QEMU to build and pass the
>>>>> error to the guest in the PAPR defined rtas error log format.
>>>>>
>>>>> This patch builds the rtas error log, copies it to the rtas_addr
>>>>> and then invokes the guest registered machine check handler. The
>>>>> handler in the guest takes suitable action(s) depending on the type
>>>>> and criticality of the error. For example, if an error is
>>>>> unrecoverable memory corruption in an application inside the
>>>>> guest, then the guest kernel sends a SIGBUS to the application.
>>>>> For recoverable errors, the guest performs recovery actions and
>>>>> logs the error.
>>>>>
>>>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>>>>> ---
>>>>>  hw/ppc/spapr.c         |    4 +
>>>>>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  include/hw/ppc/spapr.h |    4 +
>>>>>  3 files changed, 253 insertions(+)
>>>>>
>>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>>> index 2779efe..ffd1715 100644
>>>>> --- a/hw/ppc/spapr.c
>>>>> +++ b/hw/ppc/spapr.c
>>>>> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
>>>>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>>>>          exit(1);
>>>>>      }
>>>>> +
>>>>> +    /* Resize blob to accommodate error log. */
>>>>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
>>>>> +
>>>>
>>>> This is the only user for spapr_get_rtas_size(), which is trivial.
>>>> I suggest you simply open-code it here.
>>>
>>> I agree.
>>
>> Sure.
>>
>>>
>>>> But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
>>>> DT. Since existing machine types don't do that, I guess we should only use
>>>> the new size if cap-fwnmi-mce=on for the sake of compatibility.
>>>
>>> Yes, that's a good idea.  Changing this is very unlikely to break a
>>> guest, but it's easy to be safe here so let's do it.
>>
>> I did it like that because the rtas_blob is allocated based on rtas_size
>> in spapr_machine_init(). During spapr_machine_init() it is not know if
>> the guest calls "ibm, nmi-register". So if we want to use the new size
>> only when cap_fwnmi=on, then we have to realloc the blob in "ibm,
>> nmi-register".
> 
> What?  Just always allocate the necessary space in
> spapr_machine_init() if cap_fwnmi=on, it'll be wasted if
> ibm,nmi-register is never called, but it's not that much space so we
> don't really care.

Yes, not that much space, and ibm,nmi-register is called when the Linux
kernel boots. I guess, even though other OSes might not call
ibm,nmi-register, they do not constitute significant QEMU on Power users.

So I think, I will keep the code as is.

>
David Gibson May 16, 2019, 1:47 a.m. UTC | #12
On Tue, May 14, 2019 at 10:36:17AM +0530, Aravinda Prasad wrote:
> 
> 
> On Tuesday 14 May 2019 10:10 AM, David Gibson wrote:
> > On Tue, May 14, 2019 at 09:56:41AM +0530, Aravinda Prasad wrote:
> >>
> >>
> >> On Tuesday 14 May 2019 05:38 AM, David Gibson wrote:
> >>> On Mon, May 13, 2019 at 01:30:53PM +0200, Greg Kurz wrote:
> >>>> On Mon, 22 Apr 2019 12:33:26 +0530
> >>>> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
> >>>>
> >>>>> Upon a machine check exception (MCE) in a guest address space,
> >>>>> KVM causes a guest exit to enable QEMU to build and pass the
> >>>>> error to the guest in the PAPR defined rtas error log format.
> >>>>>
> >>>>> This patch builds the rtas error log, copies it to the rtas_addr
> >>>>> and then invokes the guest registered machine check handler. The
> >>>>> handler in the guest takes suitable action(s) depending on the type
> >>>>> and criticality of the error. For example, if an error is
> >>>>> unrecoverable memory corruption in an application inside the
> >>>>> guest, then the guest kernel sends a SIGBUS to the application.
> >>>>> For recoverable errors, the guest performs recovery actions and
> >>>>> logs the error.
> >>>>>
> >>>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >>>>> ---
> >>>>>  hw/ppc/spapr.c         |    4 +
> >>>>>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>>>>  include/hw/ppc/spapr.h |    4 +
> >>>>>  3 files changed, 253 insertions(+)
> >>>>>
> >>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >>>>> index 2779efe..ffd1715 100644
> >>>>> --- a/hw/ppc/spapr.c
> >>>>> +++ b/hw/ppc/spapr.c
> >>>>> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
> >>>>>          error_report("Could not get size of LPAR rtas '%s'", filename);
> >>>>>          exit(1);
> >>>>>      }
> >>>>> +
> >>>>> +    /* Resize blob to accommodate error log. */
> >>>>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
> >>>>> +
> >>>>
> >>>> This is the only user for spapr_get_rtas_size(), which is trivial.
> >>>> I suggest you simply open-code it here.
> >>>
> >>> I agree.
> >>
> >> Sure.
> >>
> >>>
> >>>> But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
> >>>> DT. Since existing machine types don't do that, I guess we should only use
> >>>> the new size if cap-fwnmi-mce=on for the sake of compatibility.
> >>>
> >>> Yes, that's a good idea.  Changing this is very unlikely to break a
> >>> guest, but it's easy to be safe here so let's do it.
> >>
> >> I did it like that because the rtas_blob is allocated based on rtas_size
> >> in spapr_machine_init(). During spapr_machine_init() it is not know if
> >> the guest calls "ibm, nmi-register". So if we want to use the new size
> >> only when cap_fwnmi=on, then we have to realloc the blob in "ibm,
> >> nmi-register".
> > 
> > What?  Just always allocate the necessary space in
> > spapr_machine_init() if cap_fwnmi=on, it'll be wasted if
> > ibm,nmi-register is never called, but it's not that much space so we
> > don't really care.
> 
> Yes, not that much space, and ibm,nmi-register is called when the Linux
> kernel boots. I guess, even though other OSes might not call
> ibm,nmi-register, they do not constitute significant QEMU on Power users.
> 
> So I think, I will keep the code as is.

No, that's not right.  It's impractical to change the allocation
depending on whether fwnmi is currently active.  But you *can* (and
should) base the allocation on whether fwnmi is *possible* - that is,
the value of the spapr cap.
Aravinda Prasad May 16, 2019, 4:54 a.m. UTC | #13
On Thursday 16 May 2019 07:17 AM, David Gibson wrote:
> On Tue, May 14, 2019 at 10:36:17AM +0530, Aravinda Prasad wrote:
>>
>>
>> On Tuesday 14 May 2019 10:10 AM, David Gibson wrote:
>>> On Tue, May 14, 2019 at 09:56:41AM +0530, Aravinda Prasad wrote:
>>>>
>>>>
>>>> On Tuesday 14 May 2019 05:38 AM, David Gibson wrote:
>>>>> On Mon, May 13, 2019 at 01:30:53PM +0200, Greg Kurz wrote:
>>>>>> On Mon, 22 Apr 2019 12:33:26 +0530
>>>>>> Aravinda Prasad <aravinda@linux.vnet.ibm.com> wrote:
>>>>>>
>>>>>>> Upon a machine check exception (MCE) in a guest address space,
>>>>>>> KVM causes a guest exit to enable QEMU to build and pass the
>>>>>>> error to the guest in the PAPR defined rtas error log format.
>>>>>>>
>>>>>>> This patch builds the rtas error log, copies it to the rtas_addr
>>>>>>> and then invokes the guest registered machine check handler. The
>>>>>>> handler in the guest takes suitable action(s) depending on the type
>>>>>>> and criticality of the error. For example, if an error is
>>>>>>> unrecoverable memory corruption in an application inside the
>>>>>>> guest, then the guest kernel sends a SIGBUS to the application.
>>>>>>> For recoverable errors, the guest performs recovery actions and
>>>>>>> logs the error.
>>>>>>>
>>>>>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>>>>>>> ---
>>>>>>>  hw/ppc/spapr.c         |    4 +
>>>>>>>  hw/ppc/spapr_events.c  |  245 ++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>>>  include/hw/ppc/spapr.h |    4 +
>>>>>>>  3 files changed, 253 insertions(+)
>>>>>>>
>>>>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>>>>> index 2779efe..ffd1715 100644
>>>>>>> --- a/hw/ppc/spapr.c
>>>>>>> +++ b/hw/ppc/spapr.c
>>>>>>> @@ -2918,6 +2918,10 @@ static void spapr_machine_init(MachineState *machine)
>>>>>>>          error_report("Could not get size of LPAR rtas '%s'", filename);
>>>>>>>          exit(1);
>>>>>>>      }
>>>>>>> +
>>>>>>> +    /* Resize blob to accommodate error log. */
>>>>>>> +    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
>>>>>>> +
>>>>>>
>>>>>> This is the only user for spapr_get_rtas_size(), which is trivial.
>>>>>> I suggest you simply open-code it here.
>>>>>
>>>>> I agree.
>>>>
>>>> Sure.
>>>>
>>>>>
>>>>>> But also, spapr->rtas_size is a guest visible thing, "rtas-size" prop in the
>>>>>> DT. Since existing machine types don't do that, I guess we should only use
>>>>>> the new size if cap-fwnmi-mce=on for the sake of compatibility.
>>>>>
>>>>> Yes, that's a good idea.  Changing this is very unlikely to break a
>>>>> guest, but it's easy to be safe here so let's do it.
>>>>
>>>> I did it like that because the rtas_blob is allocated based on rtas_size
>>>> in spapr_machine_init(). During spapr_machine_init() it is not know if
>>>> the guest calls "ibm, nmi-register". So if we want to use the new size
>>>> only when cap_fwnmi=on, then we have to realloc the blob in "ibm,
>>>> nmi-register".
>>>
>>> What?  Just always allocate the necessary space in
>>> spapr_machine_init() if cap_fwnmi=on, it'll be wasted if
>>> ibm,nmi-register is never called, but it's not that much space so we
>>> don't really care.
>>
>> Yes, not that much space, and ibm,nmi-register is called when the Linux
>> kernel boots. I guess, even though other OSes might not call
>> ibm,nmi-register, they do not constitute significant QEMU on Power users.
>>
>> So I think, I will keep the code as is.
> 
> No, that's not right.  It's impractical to change the allocation
> depending on whether fwnmi is currently active.  But you *can* (and
> should) base the allocation on whether fwnmi is *possible* - that is,
> the value of the spapr cap.

Sure..

>
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 2779efe..ffd1715 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2918,6 +2918,10 @@  static void spapr_machine_init(MachineState *machine)
         error_report("Could not get size of LPAR rtas '%s'", filename);
         exit(1);
     }
+
+    /* Resize blob to accommodate error log. */
+    spapr->rtas_size = spapr_get_rtas_size(spapr->rtas_size);
+
     spapr->rtas_blob = g_malloc(spapr->rtas_size);
     if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
         error_report("Could not load LPAR rtas '%s'", filename);
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 9922a23..4032db0 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -212,6 +212,106 @@  struct hp_extended_log {
     struct rtas_event_log_v6_hp hp;
 } QEMU_PACKED;
 
+struct rtas_event_log_v6_mc {
+#define RTAS_LOG_V6_SECTION_ID_MC                   0x4D43 /* MC */
+    struct rtas_event_log_v6_section_header hdr;
+    uint32_t fru_id;
+    uint32_t proc_id;
+    uint8_t error_type;
+#define RTAS_LOG_V6_MC_TYPE_UE                           0
+#define RTAS_LOG_V6_MC_TYPE_SLB                          1
+#define RTAS_LOG_V6_MC_TYPE_ERAT                         2
+#define RTAS_LOG_V6_MC_TYPE_TLB                          4
+#define RTAS_LOG_V6_MC_TYPE_D_CACHE                      5
+#define RTAS_LOG_V6_MC_TYPE_I_CACHE                      7
+    uint8_t sub_err_type;
+#define RTAS_LOG_V6_MC_UE_INDETERMINATE                  0
+#define RTAS_LOG_V6_MC_UE_IFETCH                         1
+#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH         2
+#define RTAS_LOG_V6_MC_UE_LOAD_STORE                     3
+#define RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE     4
+#define RTAS_LOG_V6_MC_SLB_PARITY                        0
+#define RTAS_LOG_V6_MC_SLB_MULTIHIT                      1
+#define RTAS_LOG_V6_MC_SLB_INDETERMINATE                 2
+#define RTAS_LOG_V6_MC_ERAT_PARITY                       1
+#define RTAS_LOG_V6_MC_ERAT_MULTIHIT                     2
+#define RTAS_LOG_V6_MC_ERAT_INDETERMINATE                3
+#define RTAS_LOG_V6_MC_TLB_PARITY                        1
+#define RTAS_LOG_V6_MC_TLB_MULTIHIT                      2
+#define RTAS_LOG_V6_MC_TLB_INDETERMINATE                 3
+    uint8_t reserved_1[6];
+    uint64_t effective_address;
+    uint64_t logical_address;
+} QEMU_PACKED;
+
+struct mc_extended_log {
+    struct rtas_event_log_v6 v6hdr;
+    struct rtas_event_log_v6_mc mc;
+} QEMU_PACKED;
+
+struct MC_ierror_table {
+    unsigned long srr1_mask;
+    unsigned long srr1_value;
+    bool nip_valid; /* nip is a valid indicator of faulting address */
+    uint8_t error_type;
+    uint8_t error_subtype;
+    unsigned int initiator;
+    unsigned int severity;
+};
+
+static const struct MC_ierror_table mc_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_IFETCH,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_IFETCH,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+struct MC_derror_table {
+    unsigned long dsisr_value;
+    bool dar_valid; /* dar is a valid indicator of faulting address */
+    uint8_t error_type;
+    uint8_t error_subtype;
+    unsigned int initiator;
+    unsigned int severity;
+};
+
+static const struct MC_derror_table mc_derror_table[] = {
+{ 0x00008000, false,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_LOAD_STORE,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00004000, true,
+  RTAS_LOG_V6_MC_TYPE_UE, RTAS_LOG_V6_MC_UE_PAGE_TABLE_WALK_LOAD_STORE,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000800, true,
+  RTAS_LOG_V6_MC_TYPE_ERAT, RTAS_LOG_V6_MC_ERAT_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000400, true,
+  RTAS_LOG_V6_MC_TYPE_TLB, RTAS_LOG_V6_MC_TLB_MULTIHIT,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000080, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_MULTIHIT,  /* Before PARITY */
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0x00000100, true,
+  RTAS_LOG_V6_MC_TYPE_SLB, RTAS_LOG_V6_MC_SLB_PARITY,
+  RTAS_LOG_INITIATOR_CPU, RTAS_LOG_SEVERITY_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
 typedef enum EventClass {
     EVENT_CLASS_INTERNAL_ERRORS     = 0,
     EVENT_CLASS_EPOW                = 1,
@@ -620,6 +720,147 @@  void spapr_hotplug_req_remove_by_count_indexed(SpaprDrcType drc_type,
                             RTAS_LOG_V6_HP_ACTION_REMOVE, drc_type, &drc_id);
 }
 
+ssize_t spapr_get_rtas_size(ssize_t old_rtas_size)
+{
+    g_assert(old_rtas_size < RTAS_ERRLOG_OFFSET);
+    return RTAS_ERROR_LOG_MAX;
+}
+
+static uint32_t spapr_mce_get_elog_type(PowerPCCPU *cpu, bool recovered,
+                                        struct mc_extended_log *ext_elog)
+{
+    int i;
+    CPUPPCState *env = &cpu->env;
+    uint32_t summary;
+    uint64_t dsisr = env->spr[SPR_DSISR];
+
+    summary = RTAS_LOG_VERSION_6 | RTAS_LOG_OPTIONAL_PART_PRESENT;
+    if (recovered) {
+        summary |= RTAS_LOG_DISPOSITION_FULLY_RECOVERED;
+    } else {
+        summary |= RTAS_LOG_DISPOSITION_NOT_RECOVERED;
+    }
+
+    if (SRR1_MC_LOADSTORE(env->spr[SPR_SRR1])) {
+        for (i = 0; mc_derror_table[i].dsisr_value; i++) {
+            if (!(dsisr & mc_derror_table[i].dsisr_value)) {
+                continue;
+            }
+
+            ext_elog->mc.error_type = mc_derror_table[i].error_type;
+            ext_elog->mc.sub_err_type = mc_derror_table[i].error_subtype;
+            if (mc_derror_table[i].dar_valid) {
+                ext_elog->mc.effective_address = cpu_to_be64(env->spr[SPR_DAR]);
+            }
+
+            summary |= mc_derror_table[i].initiator
+                        | mc_derror_table[i].severity;
+
+            return summary;
+        }
+    } else {
+        for (i = 0; mc_ierror_table[i].srr1_mask; i++) {
+            if ((env->spr[SPR_SRR1] & mc_ierror_table[i].srr1_mask) !=
+                    mc_ierror_table[i].srr1_value) {
+                continue;
+            }
+
+            ext_elog->mc.error_type = mc_ierror_table[i].error_type;
+            ext_elog->mc.sub_err_type = mc_ierror_table[i].error_subtype;
+            if (mc_ierror_table[i].nip_valid) {
+                ext_elog->mc.effective_address = cpu_to_be64(env->nip);
+            }
+
+            summary |= mc_ierror_table[i].initiator
+                        | mc_ierror_table[i].severity;
+
+            return summary;
+        }
+    }
+
+    summary |= RTAS_LOG_INITIATOR_CPU;
+    return summary;
+}
+
+static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool recovered)
+{
+    SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    CPUState *cs = CPU(cpu);
+    uint64_t rtas_addr;
+    CPUPPCState *env = &cpu->env;
+    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
+    target_ulong r3, msr = 0;
+    struct rtas_error_log log;
+    struct mc_extended_log *ext_elog;
+    uint32_t summary;
+
+    /*
+     * Properly set bits in MSR before we invoke the handler.
+     * SRR0/1, DAR and DSISR are properly set by KVM
+     */
+    if (!(*pcc->interrupts_big_endian)(cpu)) {
+        msr |= (1ULL << MSR_LE);
+    }
+
+    if (env->msr && (1ULL << MSR_SF)) {
+        msr |= (1ULL << MSR_SF);
+    }
+
+    msr |= (1ULL << MSR_ME);
+
+    if (spapr->guest_machine_check_addr == -1) {
+        /*
+         * This implies that we have hit a machine check between system
+         * reset and "ibm,nmi-register". Fall back to the old machine
+         * check behavior in such cases.
+         */
+        env->spr[SPR_SRR0] = env->nip;
+        env->spr[SPR_SRR1] = env->msr;
+        env->msr = msr;
+        env->nip = 0x200;
+        return;
+    }
+
+    ext_elog = g_malloc0(sizeof(struct mc_extended_log));
+    summary = spapr_mce_get_elog_type(cpu, recovered, ext_elog);
+
+    log.summary = cpu_to_be32(summary);
+    log.extended_length = cpu_to_be32(sizeof(struct mc_extended_log));
+
+    /* r3 should be in BE always */
+    r3 = cpu_to_be64(env->gpr[3]);
+    env->msr = msr;
+
+    spapr_init_v6hdr(&ext_elog->v6hdr);
+    ext_elog->mc.hdr.section_id = cpu_to_be16(RTAS_LOG_V6_SECTION_ID_MC);
+    ext_elog->mc.hdr.section_length =
+                    cpu_to_be16(sizeof(struct rtas_event_log_v6_mc));
+    ext_elog->mc.hdr.section_version = 1;
+
+    /* get rtas addr from fdt */
+    rtas_addr = spapr_get_rtas_addr();
+    if (!rtas_addr) {
+        /* Unable to fetch rtas_addr. Hence reset the guest */
+        ppc_cpu_do_system_reset(cs);
+    }
+
+    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET, &r3, sizeof(r3));
+    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3),
+                              &log, sizeof(log));
+    cpu_physical_memory_write(rtas_addr + RTAS_ERRLOG_OFFSET + sizeof(r3) +
+                              sizeof(log), ext_elog,
+                              sizeof(struct mc_extended_log));
+
+    /* Save gpr[3] in the guest endian mode */
+    if ((*pcc->interrupts_big_endian)(cpu)) {
+        env->gpr[3] = cpu_to_be64(rtas_addr + RTAS_ERRLOG_OFFSET);
+    } else {
+        env->gpr[3] = cpu_to_le64(rtas_addr + RTAS_ERRLOG_OFFSET);
+    }
+
+    env->nip = spapr->guest_machine_check_addr;
+}
+
 void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
@@ -640,6 +881,10 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
         }
     }
     spapr->mc_status = cpu->vcpu_id;
+
+    spapr_mce_dispatch_elog(cpu, recovered);
+
+    return;
 }
 
 static void check_exception(PowerPCCPU *cpu, SpaprMachineState *spapr,
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index f7204d0..03f34bf 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -661,6 +661,9 @@  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
 #define DIAGNOSTICS_RUN_MODE_IMMEDIATE 2
 #define DIAGNOSTICS_RUN_MODE_PERIODIC  3
 
+/* Offset from rtas-base where error log is placed */
+#define RTAS_ERRLOG_OFFSET       0x25
+
 static inline uint64_t ppc64_phys_to_real(uint64_t addr)
 {
     return addr & ~0xF000000000000000ULL;
@@ -798,6 +801,7 @@  int spapr_max_server_number(SpaprMachineState *spapr);
 void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
                       uint64_t pte0, uint64_t pte1);
 void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered);
+ssize_t spapr_get_rtas_size(ssize_t old_rtas_sizea);
 
 /* DRC callbacks. */
 void spapr_core_release(DeviceState *dev);