diff mbox series

[v20,6/7] migration: Include migration support for machine check handling

Message ID 20200117093855.19074-7-ganeshgr@linux.ibm.com
State New
Headers show
Series target-ppc/spapr: Add FWNMI support in QEMU for PowerKVM guests | expand

Commit Message

Ganesh Goudar Jan. 17, 2020, 9:38 a.m. UTC
From: Aravinda Prasad <arawinda.p@gmail.com>

This patch adds migration support for machine check
handling. In particular, it blocks VM migration
requests until machine check error handling is
complete, as these errors are specific to the source
hardware and are irrelevant on the target hardware.

Signed-off-by: Aravinda Prasad <arawinda.p@gmail.com>
[Do not set FWNMI cap in post_load, now it's done in the .apply hook]
Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
 hw/ppc/spapr.c         | 47 ++++++++++++++++++++++++++++++++++++++++++
 hw/ppc/spapr_events.c  | 16 +++++++++++++-
 hw/ppc/spapr_rtas.c    |  2 ++
 include/hw/ppc/spapr.h |  2 ++
 4 files changed, 66 insertions(+), 1 deletion(-)

Comments

David Gibson Jan. 20, 2020, 12:48 a.m. UTC | #1
On Fri, Jan 17, 2020 at 03:08:54PM +0530, Ganesh Goudar wrote:
> From: Aravinda Prasad <arawinda.p@gmail.com>
> 
> This patch includes migration support for machine check
> handling. Especially this patch blocks VM migration
> requests until the machine check error handling is
> complete as these errors are specific to the source
> hardware and is irrelevant on the target hardware.
> 
> Signed-off-by: Aravinda Prasad <arawinda.p@gmail.com>
> [Do not set FWNMI cap in post_load, now its done in .apply hook]
> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
> ---
>  hw/ppc/spapr.c         | 47 ++++++++++++++++++++++++++++++++++++++++++
>  hw/ppc/spapr_events.c  | 16 +++++++++++++-
>  hw/ppc/spapr_rtas.c    |  2 ++
>  include/hw/ppc/spapr.h |  2 ++
>  4 files changed, 66 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 46bac1a83c..c8bc2fa9f3 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -46,6 +46,7 @@
>  #include "migration/qemu-file-types.h"
>  #include "migration/global_state.h"
>  #include "migration/register.h"
> +#include "migration/blocker.h"
>  #include "mmu-hash64.h"
>  #include "mmu-book3s-v3.h"
>  #include "cpu-models.h"
> @@ -1683,6 +1684,8 @@ static void spapr_machine_reset(MachineState *machine)
>  
>      /* Signal all vCPUs waiting on this condition */
>      qemu_cond_broadcast(&spapr->mc_delivery_cond);
> +
> +    migrate_del_blocker(spapr->fwnmi_migration_blocker);

Thinking on our discussions about this earlier, there are
circumstances where we could add the blocker message multiple times.
IIUC, this will just remove one of them, but at reset, we need to
remove all of them.

>  }
>  
>  static void spapr_create_nvram(SpaprMachineState *spapr)
> @@ -1965,6 +1968,42 @@ static const VMStateDescription vmstate_spapr_dtb = {
>      },
>  };
>  
> +static bool spapr_fwnmi_needed(void *opaque)
> +{
> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> +
> +    return spapr->guest_machine_check_addr != -1;
> +}
> +
> +static int spapr_fwnmi_pre_save(void *opaque)
> +{
> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
> +
> +    /*
> +     * Check if machine check handling is in progress and print a
> +     * warning message.
> +     */
> +    if (spapr->mc_status != -1) {
> +        warn_report("A machine check is being handled during migration. The"
> +                "handler may run and log hardware error on the destination");
> +    }
> +
> +    return 0;
> +}
> +
> +static const VMStateDescription vmstate_spapr_machine_check = {
> +    .name = "spapr_machine_check",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = spapr_fwnmi_needed,
> +    .pre_save = spapr_fwnmi_pre_save,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
> +        VMSTATE_INT32(mc_status, SpaprMachineState),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
>  static const VMStateDescription vmstate_spapr = {
>      .name = "spapr",
>      .version_id = 3,
> @@ -1999,6 +2038,7 @@ static const VMStateDescription vmstate_spapr = {
>          &vmstate_spapr_cap_large_decr,
>          &vmstate_spapr_cap_ccf_assist,
>          &vmstate_spapr_cap_fwnmi,
> +        &vmstate_spapr_machine_check,
>          NULL
>      }
>  };
> @@ -2814,6 +2854,13 @@ static void spapr_machine_init(MachineState *machine)
>          spapr_create_lmb_dr_connectors(spapr);
>      }
>  
> +    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI_MCE) == SPAPR_CAP_ON) {
> +        /* Create the error string for live migration blocker */
> +        error_setg(&spapr->fwnmi_migration_blocker,
> +            "A machine check is being handled during migration. The handler"
> +            "may run and log hardware error on the destination");
> +    }
> +
>      /* Set up RTAS event infrastructure */
>      spapr_events_init(spapr);
>  
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index 54eaf28a9e..884e455f02 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -43,6 +43,7 @@
>  #include "qemu/main-loop.h"
>  #include "hw/ppc/spapr_ovec.h"
>  #include <libfdt.h>
> +#include "migration/blocker.h"
>  
>  #define RTAS_LOG_VERSION_MASK                   0xff000000
>  #define   RTAS_LOG_VERSION_6                    0x06000000
> @@ -843,6 +844,8 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  {
>      SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>      CPUState *cs = CPU(cpu);
> +    int ret;
> +    Error *local_err = NULL;
>  
>      if (spapr->guest_machine_check_addr == -1) {
>          /*
> @@ -872,8 +875,19 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>              return;
>          }
>      }
> -    spapr->mc_status = cpu->vcpu_id;
>  
> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
> +    if (ret == -EBUSY) {
> +        /*
> +         * We don't want to abort so we let the migration to continue.
> +         * In a rare case, the machine check handler will run on the target.
> +         * Though this is not preferable, it is better than aborting
> +         * the migration or killing the VM.
> +         */
> +        warn_report("Received a fwnmi while migration was in progress");
> +    }
> +
> +    spapr->mc_status = cpu->vcpu_id;
>      spapr_mce_dispatch_elog(cpu, recovered);
>  }
>  
> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
> index 3f162d82f5..4ce8e48d2a 100644
> --- a/hw/ppc/spapr_rtas.c
> +++ b/hw/ppc/spapr_rtas.c
> @@ -50,6 +50,7 @@
>  #include "hw/ppc/fdt.h"
>  #include "target/ppc/mmu-hash64.h"
>  #include "target/ppc/mmu-book3s-v3.h"
> +#include "migration/blocker.h"
>  
>  static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>                                     uint32_t token, uint32_t nargs,
> @@ -453,6 +454,7 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>      spapr->mc_status = -1;
>      qemu_cond_signal(&spapr->mc_delivery_cond);
>      rtas_st(rets, 0, RTAS_OUT_SUCCESS);
> +    migrate_del_blocker(spapr->fwnmi_migration_blocker);
>  }
>  
>  static struct rtas_call {
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index f6f82d88aa..a1fba95c82 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -218,6 +218,8 @@ struct SpaprMachineState {
>  
>      unsigned gpu_numa_id;
>      SpaprTpmProxy *tpm_proxy;
> +
> +    Error *fwnmi_migration_blocker;
>  };
>  
>  #define H_SUCCESS         0
Ganesh Goudar Jan. 20, 2020, 5:31 p.m. UTC | #2
On 1/20/20 6:18 AM, David Gibson wrote:
> On Fri, Jan 17, 2020 at 03:08:54PM +0530, Ganesh Goudar wrote:
>> From: Aravinda Prasad <arawinda.p@gmail.com>
>>
>> This patch includes migration support for machine check
>> handling. Especially this patch blocks VM migration
>> requests until the machine check error handling is
>> complete as these errors are specific to the source
>> hardware and is irrelevant on the target hardware.
>>
>> Signed-off-by: Aravinda Prasad <arawinda.p@gmail.com>
>> [Do not set FWNMI cap in post_load, now its done in .apply hook]
>> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
>> ---
>>   hw/ppc/spapr.c         | 47 ++++++++++++++++++++++++++++++++++++++++++
>>   hw/ppc/spapr_events.c  | 16 +++++++++++++-
>>   hw/ppc/spapr_rtas.c    |  2 ++
>>   include/hw/ppc/spapr.h |  2 ++
>>   4 files changed, 66 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>> index 46bac1a83c..c8bc2fa9f3 100644
>> --- a/hw/ppc/spapr.c
>> +++ b/hw/ppc/spapr.c
>> @@ -46,6 +46,7 @@
>>   #include "migration/qemu-file-types.h"
>>   #include "migration/global_state.h"
>>   #include "migration/register.h"
>> +#include "migration/blocker.h"
>>   #include "mmu-hash64.h"
>>   #include "mmu-book3s-v3.h"
>>   #include "cpu-models.h"
>> @@ -1683,6 +1684,8 @@ static void spapr_machine_reset(MachineState *machine)
>>   
>>       /* Signal all vCPUs waiting on this condition */
>>       qemu_cond_broadcast(&spapr->mc_delivery_cond);
>> +
>> +    migrate_del_blocker(spapr->fwnmi_migration_blocker);
> Thinking on our discussions about this earlier, there are
> circumstances where we could add the blocker message multiple times.
> IIUC, this will just remove one of them, but at reset, we need to
> remove all of them.
Sorry, I did not mention this in our previous conversation: machine
check events are serialized here. Aravinda has explained it clearly in
the commit message of patch 5/7, so there will only ever be a single
entry in the list.
>
>>   }
>>   
>>   static void spapr_create_nvram(SpaprMachineState *spapr)
>> @@ -1965,6 +1968,42 @@ static const VMStateDescription vmstate_spapr_dtb = {
>>       },
>>   };
>>   
>> +static bool spapr_fwnmi_needed(void *opaque)
>> +{
>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
>> +
>> +    return spapr->guest_machine_check_addr != -1;
>> +}
>> +
>> +static int spapr_fwnmi_pre_save(void *opaque)
>> +{
>> +    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
>> +
>> +    /*
>> +     * Check if machine check handling is in progress and print a
>> +     * warning message.
>> +     */
>> +    if (spapr->mc_status != -1) {
>> +        warn_report("A machine check is being handled during migration. The"
>> +                "handler may run and log hardware error on the destination");
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static const VMStateDescription vmstate_spapr_machine_check = {
>> +    .name = "spapr_machine_check",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = spapr_fwnmi_needed,
>> +    .pre_save = spapr_fwnmi_pre_save,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
>> +        VMSTATE_INT32(mc_status, SpaprMachineState),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>>   static const VMStateDescription vmstate_spapr = {
>>       .name = "spapr",
>>       .version_id = 3,
>> @@ -1999,6 +2038,7 @@ static const VMStateDescription vmstate_spapr = {
>>           &vmstate_spapr_cap_large_decr,
>>           &vmstate_spapr_cap_ccf_assist,
>>           &vmstate_spapr_cap_fwnmi,
>> +        &vmstate_spapr_machine_check,
>>           NULL
>>       }
>>   };
>> @@ -2814,6 +2854,13 @@ static void spapr_machine_init(MachineState *machine)
>>           spapr_create_lmb_dr_connectors(spapr);
>>       }
>>   
>> +    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI_MCE) == SPAPR_CAP_ON) {
>> +        /* Create the error string for live migration blocker */
>> +        error_setg(&spapr->fwnmi_migration_blocker,
>> +            "A machine check is being handled during migration. The handler"
>> +            "may run and log hardware error on the destination");
>> +    }
>> +
>>       /* Set up RTAS event infrastructure */
>>       spapr_events_init(spapr);
>>   
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index 54eaf28a9e..884e455f02 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -43,6 +43,7 @@
>>   #include "qemu/main-loop.h"
>>   #include "hw/ppc/spapr_ovec.h"
>>   #include <libfdt.h>
>> +#include "migration/blocker.h"
>>   
>>   #define RTAS_LOG_VERSION_MASK                   0xff000000
>>   #define   RTAS_LOG_VERSION_6                    0x06000000
>> @@ -843,6 +844,8 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>   {
>>       SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>       CPUState *cs = CPU(cpu);
>> +    int ret;
>> +    Error *local_err = NULL;
>>   
>>       if (spapr->guest_machine_check_addr == -1) {
>>           /*
>> @@ -872,8 +875,19 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>>               return;
>>           }
>>       }
>> -    spapr->mc_status = cpu->vcpu_id;
>>   
>> +    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
>> +    if (ret == -EBUSY) {
>> +        /*
>> +         * We don't want to abort so we let the migration to continue.
>> +         * In a rare case, the machine check handler will run on the target.
>> +         * Though this is not preferable, it is better than aborting
>> +         * the migration or killing the VM.
>> +         */
>> +        warn_report("Received a fwnmi while migration was in progress");
>> +    }
>> +
>> +    spapr->mc_status = cpu->vcpu_id;
>>       spapr_mce_dispatch_elog(cpu, recovered);
>>   }
>>   
>> diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
>> index 3f162d82f5..4ce8e48d2a 100644
>> --- a/hw/ppc/spapr_rtas.c
>> +++ b/hw/ppc/spapr_rtas.c
>> @@ -50,6 +50,7 @@
>>   #include "hw/ppc/fdt.h"
>>   #include "target/ppc/mmu-hash64.h"
>>   #include "target/ppc/mmu-book3s-v3.h"
>> +#include "migration/blocker.h"
>>   
>>   static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
>>                                      uint32_t token, uint32_t nargs,
>> @@ -453,6 +454,7 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
>>       spapr->mc_status = -1;
>>       qemu_cond_signal(&spapr->mc_delivery_cond);
>>       rtas_st(rets, 0, RTAS_OUT_SUCCESS);
>> +    migrate_del_blocker(spapr->fwnmi_migration_blocker);
>>   }
>>   
>>   static struct rtas_call {
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index f6f82d88aa..a1fba95c82 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -218,6 +218,8 @@ struct SpaprMachineState {
>>   
>>       unsigned gpu_numa_id;
>>       SpaprTpmProxy *tpm_proxy;
>> +
>> +    Error *fwnmi_migration_blocker;
>>   };
>>   
>>   #define H_SUCCESS         0
diff mbox series

Patch

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 46bac1a83c..c8bc2fa9f3 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -46,6 +46,7 @@ 
 #include "migration/qemu-file-types.h"
 #include "migration/global_state.h"
 #include "migration/register.h"
+#include "migration/blocker.h"
 #include "mmu-hash64.h"
 #include "mmu-book3s-v3.h"
 #include "cpu-models.h"
@@ -1683,6 +1684,8 @@  static void spapr_machine_reset(MachineState *machine)
 
     /* Signal all vCPUs waiting on this condition */
     qemu_cond_broadcast(&spapr->mc_delivery_cond);
+
+    migrate_del_blocker(spapr->fwnmi_migration_blocker);
 }
 
 static void spapr_create_nvram(SpaprMachineState *spapr)
@@ -1965,6 +1968,42 @@  static const VMStateDescription vmstate_spapr_dtb = {
     },
 };
 
+static bool spapr_fwnmi_needed(void *opaque)
+{
+    SpaprMachineState *machine = opaque;
+    /* True unless guest_machine_check_addr is -1. */
+    return !(machine->guest_machine_check_addr == -1);
+}
+
+static int spapr_fwnmi_pre_save(void *opaque)
+{
+    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+    /*
+     * Warn, without failing the save, when mc_status != -1, i.e. a
+     * machine check is still being handled. Always returns 0.
+     */
+    if (spapr->mc_status != -1) {
+        warn_report("A machine check is being handled during migration. The "
+                "handler may run and log hardware error on the destination");
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_spapr_machine_check = { /* FWNMI */
+    .name = "spapr_machine_check",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = spapr_fwnmi_needed, /* guest_machine_check_addr != -1 */
+    .pre_save = spapr_fwnmi_pre_save, /* warns if mc_status != -1 */
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
+        VMSTATE_INT32(mc_status, SpaprMachineState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
 static const VMStateDescription vmstate_spapr = {
     .name = "spapr",
     .version_id = 3,
@@ -1999,6 +2038,7 @@  static const VMStateDescription vmstate_spapr = {
         &vmstate_spapr_cap_large_decr,
         &vmstate_spapr_cap_ccf_assist,
         &vmstate_spapr_cap_fwnmi,
+        &vmstate_spapr_machine_check,
         NULL
     }
 };
@@ -2814,6 +2854,13 @@  static void spapr_machine_init(MachineState *machine)
         spapr_create_lmb_dr_connectors(spapr);
     }
 
+    if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI_MCE) == SPAPR_CAP_ON) {
+        /* Create the error string for live migration blocker */
+        error_setg(&spapr->fwnmi_migration_blocker,
+            "A machine check is being handled during migration. The handler"
+            "may run and log hardware error on the destination");
+    }
+
     /* Set up RTAS event infrastructure */
     spapr_events_init(spapr);
 
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index 54eaf28a9e..884e455f02 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -43,6 +43,7 @@ 
 #include "qemu/main-loop.h"
 #include "hw/ppc/spapr_ovec.h"
 #include <libfdt.h>
+#include "migration/blocker.h"
 
 #define RTAS_LOG_VERSION_MASK                   0xff000000
 #define   RTAS_LOG_VERSION_6                    0x06000000
@@ -843,6 +844,8 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
     CPUState *cs = CPU(cpu);
+    int ret;
+    Error *local_err = NULL;
 
     if (spapr->guest_machine_check_addr == -1) {
         /*
@@ -872,8 +875,19 @@  void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
             return;
         }
     }
-    spapr->mc_status = cpu->vcpu_id;
 
+    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
+    if (ret == -EBUSY) {
+        /*
+         * We don't want to abort so we let the migration to continue.
+         * In a rare case, the machine check handler will run on the target.
+         * Though this is not preferable, it is better than aborting
+         * the migration or killing the VM.
+         */
+        warn_report("Received a fwnmi while migration was in progress");
+    }
+
+    spapr->mc_status = cpu->vcpu_id;
     spapr_mce_dispatch_elog(cpu, recovered);
 }
 
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 3f162d82f5..4ce8e48d2a 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -50,6 +50,7 @@ 
 #include "hw/ppc/fdt.h"
 #include "target/ppc/mmu-hash64.h"
 #include "target/ppc/mmu-book3s-v3.h"
+#include "migration/blocker.h"
 
 static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                    uint32_t token, uint32_t nargs,
@@ -453,6 +454,7 @@  static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
     spapr->mc_status = -1;
     qemu_cond_signal(&spapr->mc_delivery_cond);
     rtas_st(rets, 0, RTAS_OUT_SUCCESS);
+    migrate_del_blocker(spapr->fwnmi_migration_blocker);
 }
 
 static struct rtas_call {
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index f6f82d88aa..a1fba95c82 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -218,6 +218,8 @@  struct SpaprMachineState {
 
     unsigned gpu_numa_id;
     SpaprTpmProxy *tpm_proxy;
+
+    Error *fwnmi_migration_blocker;
 };
 
 #define H_SUCCESS         0