
[v3,2/5] ppc: spapr: Handle "ibm, nmi-register" and "ibm, nmi-interlock" RTAS calls

Message ID 150287474187.9760.12052550430995757993.stgit@aravinda
State New

Commit Message

Aravinda Prasad Aug. 16, 2017, 9:12 a.m. UTC
This patch adds support in QEMU to handle "ibm,nmi-register"
and "ibm,nmi-interlock" RTAS calls.

The machine check notification address is saved when the
OS issues the "ibm,nmi-register" RTAS call.

This patch also handles the case where multiple processors
experience a machine check at or about the same time, by
handling the "ibm,nmi-interlock" call. In such cases, as per
PAPR, subsequent processors serialize waiting for the first
processor to issue the "ibm,nmi-interlock" call. The second
processor waits till the first processor, which also
received a machine check error, is done reading the error
log. The first processor issues the "ibm,nmi-interlock" call
once the error log is consumed. This patch implements the
releasing part of the error-log serialization, while a
subsequent patch (which builds the error log) handles the
locking part.

Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
---
 hw/ppc/spapr.c         |    8 ++++++++
 hw/ppc/spapr_rtas.c    |   35 +++++++++++++++++++++++++++++++++++
 include/hw/ppc/spapr.h |   10 +++++++++-
 3 files changed, 52 insertions(+), 1 deletion(-)
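
For context, here is a rough sketch (not part of this patch) of how the
delivery side introduced by the follow-up patch might use the state added
here; the function name, the mutex paired with the condition variable, and
the dispatch flow are assumptions for illustration only:

static QemuMutex mc_lock;   /* hypothetical lock paired with mc_delivery_cond */

static void spapr_mce_dispatch(PowerPCCPU *cpu, sPAPRMachineState *spapr)
{
    qemu_mutex_lock(&mc_lock);
    while (spapr->mc_in_progress) {
        /* Serialize behind the vCPU that is still reading its error log;
         * it wakes us from rtas_ibm_nmi_interlock(). */
        qemu_cond_wait(&spapr->mc_delivery_cond, &mc_lock);
    }
    spapr->mc_in_progress = true;
    qemu_mutex_unlock(&mc_lock);

    /* ... build the error log and redirect the vCPU's NIP to
     * spapr->guest_machine_check_addr ... */
}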

Comments

David Gibson Aug. 17, 2017, 1:39 a.m. UTC | #1
What's with the extra spaces in the subject line?

On Wed, Aug 16, 2017 at 02:42:21PM +0530, Aravinda Prasad wrote:
> This patch adds support in QEMU to handle "ibm,nmi-register"
> and "ibm,nmi-interlock" RTAS calls.
> 
> The machine check notification address is saved when the
> OS issues "ibm,nmi-register" RTAS call.
> 
> This patch also handles the case when multiple processors
> experience machine check at or about the same time by
> handling "ibm,nmi-interlock" call. In such cases, as per
> PAPR, subsequent processors serialize waiting for the first
> processor to issue the "ibm,nmi-interlock" call. The second
> processor waits till the first processor, which also
> received a machine check error, is done reading the error
> log. The first processor issues "ibm,nmi-interlock" call
> when the error log is consumed. This patch implements the
> releasing part of the error-log while subsequent patch
> (which builds error log) handles the locking part.
> 
> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> ---
>  hw/ppc/spapr.c         |    8 ++++++++
>  hw/ppc/spapr_rtas.c    |   35 +++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |   10 +++++++++-
>  3 files changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 2a3e53d..0bb2c4a 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1441,6 +1441,11 @@ static void ppc_spapr_reset(void)
>      first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT;
>  
>      spapr->cas_reboot = false;
> +
> +    spapr->mc_in_progress = false;
> +    spapr->guest_machine_check_addr = 0;
> +    qemu_cond_destroy(&spapr->mc_delivery_cond);
> +    qemu_cond_init(&spapr->mc_delivery_cond);
>  }
>  
>  static void spapr_create_nvram(sPAPRMachineState *spapr)
> @@ -2491,6 +2496,9 @@ static void ppc_spapr_init(MachineState *machine)
>  
>          kvmppc_spapr_enable_inkernel_multitce();
>      }
> +
> +    spapr->mc_in_progress = false;
> +    qemu_cond_init(&spapr->mc_delivery_cond);
>  }
>  
>  static int spapr_kvm_type(const char *vm_type)
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 94a2799..2f3c47b 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -348,6 +348,37 @@ static void rtas_get_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>      rtas_st(rets, 1, 100);
>  }
>  
> +static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
> +                                  sPAPRMachineState *spapr,
> +                                  uint32_t token, uint32_t nargs,
> +                                  target_ulong args,
> +                                  uint32_t nret, target_ulong rets)
> +{
> +    spapr->guest_machine_check_addr = rtas_ld(args, 1);
> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +}
> +
> +static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> +                                   sPAPRMachineState *spapr,
> +                                   uint32_t token, uint32_t nargs,
> +                                   target_ulong args,
> +                                   uint32_t nret, target_ulong rets)
> +{
> +    if (!spapr->guest_machine_check_addr) {
> +        /* NMI register not called */
> +        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> +    } else {
> +        /*
> +         * VCPU issuing "ibm,nmi-interlock" is done with NMI handling,
> +         * hence unset mc_in_progress.
> +         */
> +        spapr->mc_in_progress = false;
> +        qemu_cond_signal(&spapr->mc_delivery_cond);
> +        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    }
> +}
> +
> +
>  static struct rtas_call {
>      const char *name;
>      spapr_rtas_fn fn;
> @@ -489,6 +520,10 @@ static void core_rtas_register_types(void)
>                          rtas_set_power_level);
>      spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
>                          rtas_get_power_level);
> +    spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
> +                        rtas_ibm_nmi_register);
> +    spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
> +                        rtas_ibm_nmi_interlock);
>  }
>  
>  type_init(core_rtas_register_types)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 46012b3..eee8d33 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
>       * occurs during the unplug process. */
>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
>  
> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
> +    target_ulong guest_machine_check_addr;
> +    bool mc_in_progress;
> +    int mc_cpu;

mc_cpu isn't actually used yet in this patch.  In any case it and
mc_in_progress could probably be folded together, no?

These values will also need to be migrated, AFAICT.
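
For reference, a hedged sketch of what migrating this state could look
like as a vmstate subsection; the subsection name, carrying the
notification address as a uint64_t, and where it would hook into
vmstate_spapr are assumptions (the discussion below ends up leaning
towards blocking migration instead):

static bool spapr_machine_check_needed(void *opaque)
{
    sPAPRMachineState *spapr = opaque;

    return spapr->guest_machine_check_addr != 0 || spapr->mc_in_progress;
}

static const VMStateDescription vmstate_spapr_machine_check = {
    .name = "spapr_machine_check",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = spapr_machine_check_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(guest_machine_check_addr, sPAPRMachineState),
        VMSTATE_BOOL(mc_in_progress, sPAPRMachineState),
        VMSTATE_END_OF_LIST()
    },
};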

> +    QemuCond mc_delivery_cond;
> +
>      /*< public >*/
>      char *kvm_type;
>      MemoryHotplugState hotplug_memory;
> @@ -519,8 +525,10 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>  #define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
>  #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
>  #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
> +#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2A)
> +#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2B)
>  
> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2C)
>  
>  /* RTAS ibm,get-system-parameter token values */
>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
>
Aravinda Prasad Aug. 21, 2017, 12:35 p.m. UTC | #2
On Thursday 17 August 2017 07:09 AM, David Gibson wrote:
> What's with the extra spaces in the subject line?

I don't see any. Can you pls point out?

> 
> On Wed, Aug 16, 2017 at 02:42:21PM +0530, Aravinda Prasad wrote:
>> This patch adds support in QEMU to handle "ibm,nmi-register"
>> and "ibm,nmi-interlock" RTAS calls.
>>
>> The machine check notification address is saved when the
>> OS issues "ibm,nmi-register" RTAS call.
>>
>> This patch also handles the case when multiple processors
>> experience machine check at or about the same time by
>> handling "ibm,nmi-interlock" call. In such cases, as per
>> PAPR, subsequent processors serialize waiting for the first
>> processor to issue the "ibm,nmi-interlock" call. The second
>> processor waits till the first processor, which also
>> received a machine check error, is done reading the error
>> log. The first processor issues "ibm,nmi-interlock" call
>> when the error log is consumed. This patch implements the
>> releasing part of the error-log while subsequent patch
>> (which builds error log) handles the locking part.
>>
>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>> ---
>>  hw/ppc/spapr.c         |    8 ++++++++
>>  hw/ppc/spapr_rtas.c    |   35 +++++++++++++++++++++++++++++++++++
>>  include/hw/ppc/spapr.h |   10 +++++++++-
>>  3 files changed, 52 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 2a3e53d..0bb2c4a 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -1441,6 +1441,11 @@ static void ppc_spapr_reset(void)
>>      first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT;
>>  
>>      spapr->cas_reboot = false;
>> +
>> +    spapr->mc_in_progress = false;
>> +    spapr->guest_machine_check_addr = 0;
>> +    qemu_cond_destroy(&spapr->mc_delivery_cond);
>> +    qemu_cond_init(&spapr->mc_delivery_cond);
>>  }
>>  
>>  static void spapr_create_nvram(sPAPRMachineState *spapr)
>> @@ -2491,6 +2496,9 @@ static void ppc_spapr_init(MachineState *machine)
>>  
>>          kvmppc_spapr_enable_inkernel_multitce();
>>      }
>> +
>> +    spapr->mc_in_progress = false;
>> +    qemu_cond_init(&spapr->mc_delivery_cond);
>>  }
>>  
>>  static int spapr_kvm_type(const char *vm_type)
>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>> index 94a2799..2f3c47b 100644
>> --- a/hw/ppc/spapr_rtas.c
>> +++ b/hw/ppc/spapr_rtas.c
>> @@ -348,6 +348,37 @@ static void rtas_get_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>      rtas_st(rets, 1, 100);
>>  }
>>  
>> +static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
>> +                                  sPAPRMachineState *spapr,
>> +                                  uint32_t token, uint32_t nargs,
>> +                                  target_ulong args,
>> +                                  uint32_t nret, target_ulong rets)
>> +{
>> +    spapr->guest_machine_check_addr = rtas_ld(args, 1);
>> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +}
>> +
>> +static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>> +                                   sPAPRMachineState *spapr,
>> +                                   uint32_t token, uint32_t nargs,
>> +                                   target_ulong args,
>> +                                   uint32_t nret, target_ulong rets)
>> +{
>> +    if (!spapr->guest_machine_check_addr) {
>> +        /* NMI register not called */
>> +        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
>> +    } else {
>> +        /*
>> +         * VCPU issuing "ibm,nmi-interlock" is done with NMI handling,
>> +         * hence unset mc_in_progress.
>> +         */
>> +        spapr->mc_in_progress = false;
>> +        qemu_cond_signal(&spapr->mc_delivery_cond);
>> +        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +    }
>> +}
>> +
>> +
>>  static struct rtas_call {
>>      const char *name;
>>      spapr_rtas_fn fn;
>> @@ -489,6 +520,10 @@ static void core_rtas_register_types(void)
>>                          rtas_set_power_level);
>>      spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
>>                          rtas_get_power_level);
>> +    spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
>> +                        rtas_ibm_nmi_register);
>> +    spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
>> +                        rtas_ibm_nmi_interlock);
>>  }
>>  
>>  type_init(core_rtas_register_types)
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 46012b3..eee8d33 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
>>       * occurs during the unplug process. */
>>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
>>  
>> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
>> +    target_ulong guest_machine_check_addr;
>> +    bool mc_in_progress;
>> +    int mc_cpu;
> 
> mc_cpu isn't actually used yet in this patch.  In any case it and
> mc_in_progress could probably be folded together, no?

It is possible to fold mc_cpu and mc_in_progress together, with the
convention that -1 means no machine check is in progress, while any
other value is the index of the CPU handling it.
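
A minimal sketch of that convention (the helper names are made up for
illustration):

static inline bool spapr_mc_in_progress(sPAPRMachineState *spapr)
{
    /* -1 means no machine check in progress; otherwise the index of the
     * vCPU currently handling one. */
    return spapr->mc_cpu != -1;
}

static inline void spapr_mc_clear(sPAPRMachineState *spapr)
{
    /* Replaces "spapr->mc_in_progress = false" in rtas_ibm_nmi_interlock() */
    spapr->mc_cpu = -1;
    qemu_cond_signal(&spapr->mc_delivery_cond);
}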

> 
> These values will also need to be migrated, AFAICT.

I am thinking of how to handle migration when machine check handling
is in progress. Probably wait for machine check handling to complete
before migrating, as the error could be irrelevant once migrated to new
hardware. If that is the case we don't need to migrate these values.

Regards,
Aravinda

> 
>> +    QemuCond mc_delivery_cond;
>> +
>>      /*< public >*/
>>      char *kvm_type;
>>      MemoryHotplugState hotplug_memory;
>> @@ -519,8 +525,10 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>  #define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
>>  #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
>>  #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
>> +#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2A)
>> +#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2B)
>>  
>> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
>> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2C)
>>  
>>  /* RTAS ibm,get-system-parameter token values */
>>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
>>
>
David Gibson Aug. 22, 2017, 2:08 a.m. UTC | #3
On Mon, Aug 21, 2017 at 06:05:34PM +0530, Aravinda Prasad wrote:
> 
> 
> On Thursday 17 August 2017 07:09 AM, David Gibson wrote:
> > What's with the extra spaces in the subject line?
> 
> I don't see any. Can you pls point out?

I see "ibm, nmi-register" and "ibm, nmi-interlock"

> 
> > 
> > On Wed, Aug 16, 2017 at 02:42:21PM +0530, Aravinda Prasad wrote:
> >> This patch adds support in QEMU to handle "ibm,nmi-register"
> >> and "ibm,nmi-interlock" RTAS calls.
> >>
> >> The machine check notification address is saved when the
> >> OS issues "ibm,nmi-register" RTAS call.
> >>
> >> This patch also handles the case when multiple processors
> >> experience machine check at or about the same time by
> >> handling "ibm,nmi-interlock" call. In such cases, as per
> >> PAPR, subsequent processors serialize waiting for the first
> >> processor to issue the "ibm,nmi-interlock" call. The second
> >> processor waits till the first processor, which also
> >> received a machine check error, is done reading the error
> >> log. The first processor issues "ibm,nmi-interlock" call
> >> when the error log is consumed. This patch implements the
> >> releasing part of the error-log while subsequent patch
> >> (which builds error log) handles the locking part.
> >>
> >> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
> >> ---
> >>  hw/ppc/spapr.c         |    8 ++++++++
> >>  hw/ppc/spapr_rtas.c    |   35 +++++++++++++++++++++++++++++++++++
> >>  include/hw/ppc/spapr.h |   10 +++++++++-
> >>  3 files changed, 52 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index 2a3e53d..0bb2c4a 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -1441,6 +1441,11 @@ static void ppc_spapr_reset(void)
> >>      first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT;
> >>  
> >>      spapr->cas_reboot = false;
> >> +
> >> +    spapr->mc_in_progress = false;
> >> +    spapr->guest_machine_check_addr = 0;
> >> +    qemu_cond_destroy(&spapr->mc_delivery_cond);
> >> +    qemu_cond_init(&spapr->mc_delivery_cond);
> >>  }
> >>  
> >>  static void spapr_create_nvram(sPAPRMachineState *spapr)
> >> @@ -2491,6 +2496,9 @@ static void ppc_spapr_init(MachineState *machine)
> >>  
> >>          kvmppc_spapr_enable_inkernel_multitce();
> >>      }
> >> +
> >> +    spapr->mc_in_progress = false;
> >> +    qemu_cond_init(&spapr->mc_delivery_cond);
> >>  }
> >>  
> >>  static int spapr_kvm_type(const char *vm_type)
> >> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> >> index 94a2799..2f3c47b 100644
> >> --- a/hw/ppc/spapr_rtas.c
> >> +++ b/hw/ppc/spapr_rtas.c
> >> @@ -348,6 +348,37 @@ static void rtas_get_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr,
> >>      rtas_st(rets, 1, 100);
> >>  }
> >>  
> >> +static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
> >> +                                  sPAPRMachineState *spapr,
> >> +                                  uint32_t token, uint32_t nargs,
> >> +                                  target_ulong args,
> >> +                                  uint32_t nret, target_ulong rets)
> >> +{
> >> +    spapr->guest_machine_check_addr = rtas_ld(args, 1);
> >> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >> +}
> >> +
> >> +static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
> >> +                                   sPAPRMachineState *spapr,
> >> +                                   uint32_t token, uint32_t nargs,
> >> +                                   target_ulong args,
> >> +                                   uint32_t nret, target_ulong rets)
> >> +{
> >> +    if (!spapr->guest_machine_check_addr) {
> >> +        /* NMI register not called */
> >> +        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
> >> +    } else {
> >> +        /*
> >> +         * VCPU issuing "ibm,nmi-interlock" is done with NMI handling,
> >> +         * hence unset mc_in_progress.
> >> +         */
> >> +        spapr->mc_in_progress = false;
> >> +        qemu_cond_signal(&spapr->mc_delivery_cond);
> >> +        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> >> +    }
> >> +}
> >> +
> >> +
> >>  static struct rtas_call {
> >>      const char *name;
> >>      spapr_rtas_fn fn;
> >> @@ -489,6 +520,10 @@ static void core_rtas_register_types(void)
> >>                          rtas_set_power_level);
> >>      spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
> >>                          rtas_get_power_level);
> >> +    spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
> >> +                        rtas_ibm_nmi_register);
> >> +    spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
> >> +                        rtas_ibm_nmi_interlock);
> >>  }
> >>  
> >>  type_init(core_rtas_register_types)
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index 46012b3..eee8d33 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
> >>       * occurs during the unplug process. */
> >>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
> >>  
> >> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
> >> +    target_ulong guest_machine_check_addr;
> >> +    bool mc_in_progress;
> >> +    int mc_cpu;
> > 
> > mc_cpu isn't actually used yet in this patch.  In any case it and
> > mc_in_progress could probably be folded together, no?
> 
> It is possible to fold mc_cpu and mc_in_progress together with the
> convention that if it is set to -1 mc is not in progress otherwise it is
> set to the CPU handling the mc.
> 
> > 
> > These values will also need to be migrated, AFAICT.
> 
> I am thinking of how to handle the migration when machine check handling
> is in progress. Probably wait for machine check handling to complete
> before migrating as the error could be irrelevant once migrated to a new
> hardware. If that is the case we don't need to migrate these values.

Ok.

> 
> Regards,
> Aravinda
> 
> > 
> >> +    QemuCond mc_delivery_cond;
> >> +
> >>      /*< public >*/
> >>      char *kvm_type;
> >>      MemoryHotplugState hotplug_memory;
> >> @@ -519,8 +525,10 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
> >>  #define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
> >>  #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
> >>  #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
> >> +#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2A)
> >> +#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2B)
> >>  
> >> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
> >> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2C)
> >>  
> >>  /* RTAS ibm,get-system-parameter token values */
> >>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
> >>
> > 
>
Aravinda Prasad Aug. 22, 2017, 7:12 a.m. UTC | #4
On Tuesday 22 August 2017 07:38 AM, David Gibson wrote:
> On Mon, Aug 21, 2017 at 06:05:34PM +0530, Aravinda Prasad wrote:
>>
>>
>> On Thursday 17 August 2017 07:09 AM, David Gibson wrote:
>>> What's with the extra spaces in the subject line?
>>
>> I don't see any. Can you pls point out?
> 
> I see "ibm, nmi-register" and "ibm, nmi-interlock"

Ah.. space after comma. Will correct it.

Regards,
Aravinda

> 
>>
>>>
>>> On Wed, Aug 16, 2017 at 02:42:21PM +0530, Aravinda Prasad wrote:
>>>> This patch adds support in QEMU to handle "ibm,nmi-register"
>>>> and "ibm,nmi-interlock" RTAS calls.
>>>>
>>>> The machine check notification address is saved when the
>>>> OS issues "ibm,nmi-register" RTAS call.
>>>>
>>>> This patch also handles the case when multiple processors
>>>> experience machine check at or about the same time by
>>>> handling "ibm,nmi-interlock" call. In such cases, as per
>>>> PAPR, subsequent processors serialize waiting for the first
>>>> processor to issue the "ibm,nmi-interlock" call. The second
>>>> processor waits till the first processor, which also
>>>> received a machine check error, is done reading the error
>>>> log. The first processor issues "ibm,nmi-interlock" call
>>>> when the error log is consumed. This patch implements the
>>>> releasing part of the error-log while subsequent patch
>>>> (which builds error log) handles the locking part.
>>>>
>>>> Signed-off-by: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
>>>> ---
>>>>  hw/ppc/spapr.c         |    8 ++++++++
>>>>  hw/ppc/spapr_rtas.c    |   35 +++++++++++++++++++++++++++++++++++
>>>>  include/hw/ppc/spapr.h |   10 +++++++++-
>>>>  3 files changed, 52 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>> index 2a3e53d..0bb2c4a 100644
>>>> --- a/hw/ppc/spapr.c
>>>> +++ b/hw/ppc/spapr.c
>>>> @@ -1441,6 +1441,11 @@ static void ppc_spapr_reset(void)
>>>>      first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT;
>>>>  
>>>>      spapr->cas_reboot = false;
>>>> +
>>>> +    spapr->mc_in_progress = false;
>>>> +    spapr->guest_machine_check_addr = 0;
>>>> +    qemu_cond_destroy(&spapr->mc_delivery_cond);
>>>> +    qemu_cond_init(&spapr->mc_delivery_cond);
>>>>  }
>>>>  
>>>>  static void spapr_create_nvram(sPAPRMachineState *spapr)
>>>> @@ -2491,6 +2496,9 @@ static void ppc_spapr_init(MachineState *machine)
>>>>  
>>>>          kvmppc_spapr_enable_inkernel_multitce();
>>>>      }
>>>> +
>>>> +    spapr->mc_in_progress = false;
>>>> +    qemu_cond_init(&spapr->mc_delivery_cond);
>>>>  }
>>>>  
>>>>  static int spapr_kvm_type(const char *vm_type)
>>>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>>>> index 94a2799..2f3c47b 100644
>>>> --- a/hw/ppc/spapr_rtas.c
>>>> +++ b/hw/ppc/spapr_rtas.c
>>>> @@ -348,6 +348,37 @@ static void rtas_get_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr,
>>>>      rtas_st(rets, 1, 100);
>>>>  }
>>>>  
>>>> +static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
>>>> +                                  sPAPRMachineState *spapr,
>>>> +                                  uint32_t token, uint32_t nargs,
>>>> +                                  target_ulong args,
>>>> +                                  uint32_t nret, target_ulong rets)
>>>> +{
>>>> +    spapr->guest_machine_check_addr = rtas_ld(args, 1);
>>>> +    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>>>> +}
>>>> +
>>>> +static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>>>> +                                   sPAPRMachineState *spapr,
>>>> +                                   uint32_t token, uint32_t nargs,
>>>> +                                   target_ulong args,
>>>> +                                   uint32_t nret, target_ulong rets)
>>>> +{
>>>> +    if (!spapr->guest_machine_check_addr) {
>>>> +        /* NMI register not called */
>>>> +        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
>>>> +    } else {
>>>> +        /*
>>>> +         * VCPU issuing "ibm,nmi-interlock" is done with NMI handling,
>>>> +         * hence unset mc_in_progress.
>>>> +         */
>>>> +        spapr->mc_in_progress = false;
>>>> +        qemu_cond_signal(&spapr->mc_delivery_cond);
>>>> +        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>>>> +    }
>>>> +}
>>>> +
>>>> +
>>>>  static struct rtas_call {
>>>>      const char *name;
>>>>      spapr_rtas_fn fn;
>>>> @@ -489,6 +520,10 @@ static void core_rtas_register_types(void)
>>>>                          rtas_set_power_level);
>>>>      spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
>>>>                          rtas_get_power_level);
>>>> +    spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
>>>> +                        rtas_ibm_nmi_register);
>>>> +    spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
>>>> +                        rtas_ibm_nmi_interlock);
>>>>  }
>>>>  
>>>>  type_init(core_rtas_register_types)
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index 46012b3..eee8d33 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
>>>>       * occurs during the unplug process. */
>>>>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
>>>>  
>>>> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
>>>> +    target_ulong guest_machine_check_addr;
>>>> +    bool mc_in_progress;
>>>> +    int mc_cpu;
>>>
>>> mc_cpu isn't actually used yet in this patch.  In any case it and
>>> mc_in_progress could probably be folded together, no?
>>
>> It is possible to fold mc_cpu and mc_in_progress together with the
>> convention that if it is set to -1 mc is not in progress otherwise it is
>> set to the CPU handling the mc.
>>
>>>
>>> These values will also need to be migrated, AFAICT.
>>
>> I am thinking of how to handle the migration when machine check handling
>> is in progress. Probably wait for machine check handling to complete
>> before migrating as the error could be irrelevant once migrated to a new
>> hardware. If that is the case we don't need to migrate these values.
> 
> Ok.
> 
>>
>> Regards,
>> Aravinda
>>
>>>
>>>> +    QemuCond mc_delivery_cond;
>>>> +
>>>>      /*< public >*/
>>>>      char *kvm_type;
>>>>      MemoryHotplugState hotplug_memory;
>>>> @@ -519,8 +525,10 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>>>  #define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
>>>>  #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
>>>>  #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
>>>> +#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2A)
>>>> +#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2B)
>>>>  
>>>> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
>>>> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2C)
>>>>  
>>>>  /* RTAS ibm,get-system-parameter token values */
>>>>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
>>>>
>>>
>>
>
Aravinda Prasad Sept. 21, 2017, 9:09 a.m. UTC | #5
On Tuesday 22 August 2017 07:38 AM, David Gibson wrote:

[ . . . ]

>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index 46012b3..eee8d33 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
>>>>       * occurs during the unplug process. */
>>>>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
>>>>  
>>>> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
>>>> +    target_ulong guest_machine_check_addr;
>>>> +    bool mc_in_progress;
>>>> +    int mc_cpu;
>>>
>>> mc_cpu isn't actually used yet in this patch.  In any case it and
>>> mc_in_progress could probably be folded together, no?
>>
>> It is possible to fold mc_cpu and mc_in_progress together with the
>> convention that if it is set to -1 mc is not in progress otherwise it is
>> set to the CPU handling the mc.
>>
>>>
>>> These values will also need to be migrated, AFAICT.
>>
>> I am thinking of how to handle the migration when machine check handling
>> is in progress. Probably wait for machine check handling to complete
>> before migrating as the error could be irrelevant once migrated to a new
>> hardware. If that is the case we don't need to migrate these values.
> 
> Ok.

This is what I think about handling machine check during migration based
on my understanding of the VM migration code.

There are two possibilities here. First, migration can be initiated
while machine check handling is in progress. Second, a machine check
error can happen while the migration is in progress.

To handle the first case we can add a migrate_add_blocker() call when we
start handling the machine check error and issue migrate_del_blocker()
when done. I think this should solve the issue.
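
A rough sketch of that approach, assuming the blocker API as of QEMU 2.10
(migrate_add_blocker()/migrate_del_blocker()); the helper names and call
sites are illustrative only:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "migration/blocker.h"

static Error *mc_migration_blocker;

static int spapr_mc_block_migration(void)
{
    Error *local_err = NULL;

    error_setg(&mc_migration_blocker, "machine check handling in progress");
    if (migrate_add_blocker(mc_migration_blocker, &local_err) < 0) {
        /* A migration is already running -- the tricky second case below */
        error_report_err(local_err);
        error_free(mc_migration_blocker);
        mc_migration_blocker = NULL;
        return -EBUSY;
    }
    return 0;
}

static void spapr_mc_unblock_migration(void)
{
    if (mc_migration_blocker) {
        migrate_del_blocker(mc_migration_blocker);
        error_free(mc_migration_blocker);
        mc_migration_blocker = NULL;
    }
}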

The second case is a bit tricky. The migration has already started and
hence the migrate_add_blocker() call will fail. We also cannot wait till
the migration completes to handle the machine check error, as the VM's
data could be corrupt.

Machine check errors should not be an issue while the migration is in
the RAM copy phase, as the VM is still active with vCPUs running. The
problem is when we hit a machine check as the migration is about to
complete. For example,

1. vCPU2 hits a machine check error during migration.

2. KVM causes a VM exit on vCPU2 and the NIP of vCPU2 is changed to the
guest-registered machine check handler.

3. The migration_completion() issues vm_stop() and hence either vCPU2 is
never scheduled again on the source hardware or vCPU2 is preempted while
executing the machine check handler.

4. vCPU2 is resumed on the target hardware and either starts or
continues processing the machine check error. This could be a problem,
as these errors are specific to the source hardware. For instance, when
the guest issues memory poisoning upon such an error, a clean page on
the target hardware is poisoned while the corrupt page on the source
hardware is not.

The second case, hitting a machine check during the final phase of
migration, is rare, but I wanted to check what others think about it.

Regards,
Aravinda

> 
>>
>> Regards,
>> Aravinda
>>
>>>
>>>> +    QemuCond mc_delivery_cond;
>>>> +
>>>>      /*< public >*/
>>>>      char *kvm_type;
>>>>      MemoryHotplugState hotplug_memory;
>>>> @@ -519,8 +525,10 @@ target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
>>>>  #define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
>>>>  #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
>>>>  #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
>>>> +#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2A)
>>>> +#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2B)
>>>>  
>>>> -#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
>>>> +#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2C)
>>>>  
>>>>  /* RTAS ibm,get-system-parameter token values */
>>>>  #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20
>>>>
>>>
>>
>
David Gibson Sept. 27, 2017, 7:15 a.m. UTC | #6
On Thu, Sep 21, 2017 at 02:39:06PM +0530, Aravinda Prasad wrote:
> 
> 
> On Tuesday 22 August 2017 07:38 AM, David Gibson wrote:
> 
> [ . . . ]
> 
> >>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>> index 46012b3..eee8d33 100644
> >>>> --- a/include/hw/ppc/spapr.h
> >>>> +++ b/include/hw/ppc/spapr.h
> >>>> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
> >>>>       * occurs during the unplug process. */
> >>>>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
> >>>>  
> >>>> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
> >>>> +    target_ulong guest_machine_check_addr;
> >>>> +    bool mc_in_progress;
> >>>> +    int mc_cpu;
> >>>
> >>> mc_cpu isn't actually used yet in this patch.  In any case it and
> >>> mc_in_progress could probably be folded together, no?
> >>
> >> It is possible to fold mc_cpu and mc_in_progress together with the
> >> convention that if it is set to -1 mc is not in progress otherwise it is
> >> set to the CPU handling the mc.
> >>
> >>>
> >>> These values will also need to be migrated, AFAICT.
> >>
> >> I am thinking of how to handle the migration when machine check handling
> >> is in progress. Probably wait for machine check handling to complete
> >> before migrating as the error could be irrelevant once migrated to a new
> >> hardware. If that is the case we don't need to migrate these values.
> > 
> > Ok.
> 
> This is what I think about handling machine check during migration based
> on my understanding of the VM migration code.
> 
> There are two possibilities here. First, migration can be initiated
> while the machine check handling is in progress. Second, A machine check
> error can happen when the migration is in progress.
> 
> To handle the first case we can add migrate_add_blocker() call when we
> start handling the machine check error and issue migrate_del_blocker()
> when done. I think this should solve the issue.
> 
> The second case is bit tricky. The migration has already started and
> hence migrate_add_blocker() call will fail. We also cannot wait till the
> completion of the migration to handle machine check error as the VM's
> data could be corrupt.
> 
> Machine check errors should not be an issue when the migration is in the
> RAM copy phase as VM is still active with vCPUs running. The problem is
> when we hit a machine check when the migration is about to complete. For
> example,
> 
> 1. vCPU2 hits a machine check error during migration.
> 
> 2. KVM causes VM exit on vCPU2 and the NIP of vCPU2 is changed to the
> guest registered machine check handler.
> 
> 3. The migration_completion() issues vm_stop() and hence either vCPU2 is
> never scheduled again on the source hardware or vCPU2 is preempted while
> executing the machine check handler.
> 
> 4. vCPU2 is resumed on the target hardware and either starts or
> continues processing the machine check error. This could be a problem as
> these errors are specific to the source hardware. For instance, when the
> the guest issues memory poisoning upon such error, a clean page on the
> target hardware is poisoned while the corrupt page on source hardware is
> not poisoned.
> 
> The second case of hitting machine check during the final phase of
> migration is rare but wanted to check what others think about it.

So, I've had a bit of a think about this.  I don't recall if these
fwnmi machine checks are expected on guest RAM, or guest IO addresses.

1) If RAM

  What exactly is the guest's notification for?  Even without
  migration, the host's free to move guest memory around in host
  memory, so it seems any hardware level poking should be done on the
  host side.

  Is it just to notify the guest that we weren't able to fully recover
  on the host side and that page may contain corrupted data?  If
  that's so then it seems resuming the handling on the destination is
  still right.  It may be new good RAM, but the contents we migrated
  could still be corrupt from the machine check event on the source.

2) If IO

  AFAICT this could only happen with VFIO passthrough devices.. but it
  shouldn't be possible to migrate if there are any of those.

Or have I missed something..
Aravinda Prasad Sept. 27, 2017, 11:53 a.m. UTC | #7
On Wednesday 27 September 2017 12:45 PM, David Gibson wrote:
> On Thu, Sep 21, 2017 at 02:39:06PM +0530, Aravinda Prasad wrote:
>>
>>
>> On Tuesday 22 August 2017 07:38 AM, David Gibson wrote:
>>
>> [ . . . ]
>>
>>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>>>> index 46012b3..eee8d33 100644
>>>>>> --- a/include/hw/ppc/spapr.h
>>>>>> +++ b/include/hw/ppc/spapr.h
>>>>>> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
>>>>>>       * occurs during the unplug process. */
>>>>>>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
>>>>>>  
>>>>>> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
>>>>>> +    target_ulong guest_machine_check_addr;
>>>>>> +    bool mc_in_progress;
>>>>>> +    int mc_cpu;
>>>>>
>>>>> mc_cpu isn't actually used yet in this patch.  In any case it and
>>>>> mc_in_progress could probably be folded together, no?
>>>>
>>>> It is possible to fold mc_cpu and mc_in_progress together with the
>>>> convention that if it is set to -1 mc is not in progress otherwise it is
>>>> set to the CPU handling the mc.
>>>>
>>>>>
>>>>> These values will also need to be migrated, AFAICT.
>>>>
>>>> I am thinking of how to handle the migration when machine check handling
>>>> is in progress. Probably wait for machine check handling to complete
>>>> before migrating as the error could be irrelevant once migrated to a new
>>>> hardware. If that is the case we don't need to migrate these values.
>>>
>>> Ok.
>>
>> This is what I think about handling machine check during migration based
>> on my understanding of the VM migration code.
>>
>> There are two possibilities here. First, migration can be initiated
>> while the machine check handling is in progress. Second, A machine check
>> error can happen when the migration is in progress.
>>
>> To handle the first case we can add migrate_add_blocker() call when we
>> start handling the machine check error and issue migrate_del_blocker()
>> when done. I think this should solve the issue.
>>
>> The second case is bit tricky. The migration has already started and
>> hence migrate_add_blocker() call will fail. We also cannot wait till the
>> completion of the migration to handle machine check error as the VM's
>> data could be corrupt.
>>
>> Machine check errors should not be an issue when the migration is in the
>> RAM copy phase as VM is still active with vCPUs running. The problem is
>> when we hit a machine check when the migration is about to complete. For
>> example,
>>
>> 1. vCPU2 hits a machine check error during migration.
>>
>> 2. KVM causes VM exit on vCPU2 and the NIP of vCPU2 is changed to the
>> guest registered machine check handler.
>>
>> 3. The migration_completion() issues vm_stop() and hence either vCPU2 is
>> never scheduled again on the source hardware or vCPU2 is preempted while
>> executing the machine check handler.
>>
>> 4. vCPU2 is resumed on the target hardware and either starts or
>> continues processing the machine check error. This could be a problem as
>> these errors are specific to the source hardware. For instance, when the
>> the guest issues memory poisoning upon such error, a clean page on the
>> target hardware is poisoned while the corrupt page on source hardware is
>> not poisoned.
>>
>> The second case of hitting machine check during the final phase of
>> migration is rare but wanted to check what others think about it.
> 
> So, I've had a bit of a think about this.  I don't recall if these
> fwnmi machine checks are expected on guest RAM, or guest IO addresses.

It is expected on guest RAM. I am not sure about guest IO addresses.

> 
> 1) If RAM
> 
>   What exactly is the guest's notification for?  Even without
>   migration, the host's free to move guest memory around in host
>   memory, so it seems any hardware level poking should be done on the
>   host side.

If the error is a correctable error, the host takes care of it by
moving the page to a different location, and the guest need not be and
will not be notified. The guest is notified only if the host is not able
to fully recover. Hence we hit FWNMI in the guest when RAM errors are
not recovered by the host.

> 
>   Is it just to notify the guest that we weren't able to fully recover
>   on the host side and that page may contain corrupted data?  If
>   that's so then it seems resuming the handling on the destination is
>   still right.  It may be new good RAM, but the contents we migrated
>   could still be corrupt from the machine check event on the source.

Yes. This is what I am doing in my v5 patch set, which I am about to
post. Additionally, I block migration while processing machine check
errors.

> 
> 2) If IO
> 
>   AFAICT this could only happen with VFIO passthrough devices.. but it
>   shouldn't be possible to migrate if there are any of those.
> 

I am not very sure about IO errors.

Regards,
Aravinda

> Or have I missed something..
>
David Gibson Sept. 28, 2017, 3:58 a.m. UTC | #8
On Wed, Sep 27, 2017 at 05:23:51PM +0530, Aravinda Prasad wrote:
> 
> 
> On Wednesday 27 September 2017 12:45 PM, David Gibson wrote:
> > On Thu, Sep 21, 2017 at 02:39:06PM +0530, Aravinda Prasad wrote:
> >>
> >>
> >> On Tuesday 22 August 2017 07:38 AM, David Gibson wrote:
> >>
> >> [ . . . ]
> >>
> >>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>>>> index 46012b3..eee8d33 100644
> >>>>>> --- a/include/hw/ppc/spapr.h
> >>>>>> +++ b/include/hw/ppc/spapr.h
> >>>>>> @@ -123,6 +123,12 @@ struct sPAPRMachineState {
> >>>>>>       * occurs during the unplug process. */
> >>>>>>      QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
> >>>>>>  
> >>>>>> +    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
> >>>>>> +    target_ulong guest_machine_check_addr;
> >>>>>> +    bool mc_in_progress;
> >>>>>> +    int mc_cpu;
> >>>>>
> >>>>> mc_cpu isn't actually used yet in this patch.  In any case it and
> >>>>> mc_in_progress could probably be folded together, no?
> >>>>
> >>>> It is possible to fold mc_cpu and mc_in_progress together with the
> >>>> convention that if it is set to -1 mc is not in progress otherwise it is
> >>>> set to the CPU handling the mc.
> >>>>
> >>>>>
> >>>>> These values will also need to be migrated, AFAICT.
> >>>>
> >>>> I am thinking of how to handle the migration when machine check handling
> >>>> is in progress. Probably wait for machine check handling to complete
> >>>> before migrating as the error could be irrelevant once migrated to a new
> >>>> hardware. If that is the case we don't need to migrate these values.
> >>>
> >>> Ok.
> >>
> >> This is what I think about handling machine check during migration based
> >> on my understanding of the VM migration code.
> >>
> >> There are two possibilities here. First, migration can be initiated
> >> while the machine check handling is in progress. Second, A machine check
> >> error can happen when the migration is in progress.
> >>
> >> To handle the first case we can add migrate_add_blocker() call when we
> >> start handling the machine check error and issue migrate_del_blocker()
> >> when done. I think this should solve the issue.
> >>
> >> The second case is bit tricky. The migration has already started and
> >> hence migrate_add_blocker() call will fail. We also cannot wait till the
> >> completion of the migration to handle machine check error as the VM's
> >> data could be corrupt.
> >>
> >> Machine check errors should not be an issue when the migration is in the
> >> RAM copy phase as VM is still active with vCPUs running. The problem is
> >> when we hit a machine check when the migration is about to complete. For
> >> example,
> >>
> >> 1. vCPU2 hits a machine check error during migration.
> >>
> >> 2. KVM causes VM exit on vCPU2 and the NIP of vCPU2 is changed to the
> >> guest registered machine check handler.
> >>
> >> 3. The migration_completion() issues vm_stop() and hence either vCPU2 is
> >> never scheduled again on the source hardware or vCPU2 is preempted while
> >> executing the machine check handler.
> >>
> >> 4. vCPU2 is resumed on the target hardware and either starts or
> >> continues processing the machine check error. This could be a problem as
> >> these errors are specific to the source hardware. For instance, when the
> >> the guest issues memory poisoning upon such error, a clean page on the
> >> target hardware is poisoned while the corrupt page on source hardware is
> >> not poisoned.
> >>
> >> The second case of hitting machine check during the final phase of
> >> migration is rare but wanted to check what others think about it.
> > 
> > So, I've had a bit of a think about this.  I don't recall if these
> > fwnmi machine checks are expected on guest RAM, or guest IO addresses.
> 
> It is expected on guest RAM. I am not sure about guest IO address.
> 
> > 
> > 1) If RAM
> > 
> >   What exactly is the guest's notification for?  Even without
> >   migration, the host's free to move guest memory around in host
> >   memory, so it seems any hardware level poking should be done on the
> >   host side.
> 
> If the error is a correctable error, then host takes care of it by
> moving the page to a different location, the guest need not be and will
> not be notified. Guest will be notified if host is not able to fully
> recover. Hence we hit FWNMI in guest when RAM errors are not recovered
> by the host.

Ok.

> >   Is it just to notify the guest that we weren't able to fully recover
> >   on the host side and that page may contain corrupted data?  If
> >   that's so then it seems resuming the handling on the destination is
> >   still right.  It may be new good RAM, but the contents we migrated
> >   could still be corrupt from the machine check event on the source.
> 
> Yes. This is what I am doing in my v5 patch set which I am about to
> post. Additionally I block migration when processing machine check errors.
> 
> > 
> > 2) If IO
> > 
> >   AFAICT this could only happen with VFIO passthrough devices.. but it
> >   shouldn't be possible to migrate if there are any of those.
> > 
> 
> I am not very sure about IO errors.

Ok.  It sounds like that's not the primary case you're interested in,
so I guess we can ignore it for now.

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 2a3e53d..0bb2c4a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1441,6 +1441,11 @@  static void ppc_spapr_reset(void)
     first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT;
 
     spapr->cas_reboot = false;
+
+    spapr->mc_in_progress = false;
+    spapr->guest_machine_check_addr = 0;
+    qemu_cond_destroy(&spapr->mc_delivery_cond);
+    qemu_cond_init(&spapr->mc_delivery_cond);
 }
 
 static void spapr_create_nvram(sPAPRMachineState *spapr)
@@ -2491,6 +2496,9 @@  static void ppc_spapr_init(MachineState *machine)
 
         kvmppc_spapr_enable_inkernel_multitce();
     }
+
+    spapr->mc_in_progress = false;
+    qemu_cond_init(&spapr->mc_delivery_cond);
 }
 
 static int spapr_kvm_type(const char *vm_type)
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 94a2799..2f3c47b 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -348,6 +348,37 @@  static void rtas_get_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr,
     rtas_st(rets, 1, 100);
 }
 
+static void rtas_ibm_nmi_register(PowerPCCPU *cpu,
+                                  sPAPRMachineState *spapr,
+                                  uint32_t token, uint32_t nargs,
+                                  target_ulong args,
+                                  uint32_t nret, target_ulong rets)
+{
+    spapr->guest_machine_check_addr = rtas_ld(args, 1);
+    rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+}
+
+static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
+                                   sPAPRMachineState *spapr,
+                                   uint32_t token, uint32_t nargs,
+                                   target_ulong args,
+                                   uint32_t nret, target_ulong rets)
+{
+    if (!spapr->guest_machine_check_addr) {
+        /* NMI register not called */
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+    } else {
+        /*
+         * VCPU issuing "ibm,nmi-interlock" is done with NMI handling,
+         * hence unset mc_in_progress.
+         */
+        spapr->mc_in_progress = false;
+        qemu_cond_signal(&spapr->mc_delivery_cond);
+        rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    }
+}
+
+
 static struct rtas_call {
     const char *name;
     spapr_rtas_fn fn;
@@ -489,6 +520,10 @@  static void core_rtas_register_types(void)
                         rtas_set_power_level);
     spapr_rtas_register(RTAS_GET_POWER_LEVEL, "get-power-level",
                         rtas_get_power_level);
+    spapr_rtas_register(RTAS_IBM_NMI_REGISTER, "ibm,nmi-register",
+                        rtas_ibm_nmi_register);
+    spapr_rtas_register(RTAS_IBM_NMI_INTERLOCK, "ibm,nmi-interlock",
+                        rtas_ibm_nmi_interlock);
 }
 
 type_init(core_rtas_register_types)
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 46012b3..eee8d33 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -123,6 +123,12 @@  struct sPAPRMachineState {
      * occurs during the unplug process. */
     QTAILQ_HEAD(, sPAPRDIMMState) pending_dimm_unplugs;
 
+    /* State related to "ibm,nmi-register" and "ibm,nmi-interlock" calls */
+    target_ulong guest_machine_check_addr;
+    bool mc_in_progress;
+    int mc_cpu;
+    QemuCond mc_delivery_cond;
+
     /*< public >*/
     char *kvm_type;
     MemoryHotplugState hotplug_memory;
@@ -519,8 +525,10 @@  target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
 #define RTAS_IBM_CREATE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x27)
 #define RTAS_IBM_REMOVE_PE_DMA_WINDOW           (RTAS_TOKEN_BASE + 0x28)
 #define RTAS_IBM_RESET_PE_DMA_WINDOW            (RTAS_TOKEN_BASE + 0x29)
+#define RTAS_IBM_NMI_REGISTER                   (RTAS_TOKEN_BASE + 0x2A)
+#define RTAS_IBM_NMI_INTERLOCK                  (RTAS_TOKEN_BASE + 0x2B)
 
-#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2A)
+#define RTAS_TOKEN_MAX                          (RTAS_TOKEN_BASE + 0x2C)
 
 /* RTAS ibm,get-system-parameter token values */
 #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS      20