diff mbox series

[v3,3/3] powernv/kdump: Fix cases where the kdump kernel can get HMI's

Message ID 20171215012740.30291-3-bsingharora@gmail.com (mailing list archive)
State Superseded
Headers show
Series [v3,1/3] powerpc/crash: Remove the test for cpu_online in the IPI callback | expand

Commit Message

Balbir Singh Dec. 15, 2017, 1:27 a.m. UTC
Certain HMI's such as malfunction error propagate through
all threads/core on the system. If a thread was offline
prior to us crashing the system and jumping to the kdump
kernel, bad things happen when it wakes up due to an HMI
in the kdump kernel.

There are several possible ways to solve this problem

1. Put the offline cores in a state such that they are
not woken up for machine check and HMI errors. This
does not work, since we might need to wake up offline
threads to handle TB errors
2. Ignore HMI errors, setup HMEER to mask HMI errors,
but this still leads the window open for any MCEs
and masking them for the duration of the dump might
be a concern
3. Wake up offline CPUs, as in send them to
crash_ipi_callback (not wake them up as in mark them
online as seen by the hotplug). kexec does a
wake_online_cpus() call, this patch does something
similar, but instead sends an IPI and forces them to
crash_ipi_callback()

This patch takes approach #3.

Care is taken to enable this only for powenv platforms
via crash_wake_offline (a global value set at setup
time). The crash code sends out IPI's to all CPU's
which then move to crash_ipi_callback and kexec_smp_wait().

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
---

Changelog v3
 - Use SRR1's reason to wake up to drive replay_system_reset()
   as a means of getting to kdump() as opposed to calling
   crash_ipi_callback based on comments from Nick Piggin.

 arch/powerpc/include/asm/kexec.h     |  2 ++
 arch/powerpc/kernel/crash.c          | 13 ++++++++++++-
 arch/powerpc/kernel/smp.c            | 18 ++++++++++++++++++
 arch/powerpc/platforms/powernv/smp.c | 12 ++++++++++++
 4 files changed, 44 insertions(+), 1 deletion(-)

Comments

Nicholas Piggin Dec. 15, 2017, 3:10 a.m. UTC | #1
On Fri, 15 Dec 2017 12:27:40 +1100
Balbir Singh <bsingharora@gmail.com> wrote:

> Certain HMI's such as malfunction error propagate through
> all threads/core on the system. If a thread was offline
> prior to us crashing the system and jumping to the kdump
> kernel, bad things happen when it wakes up due to an HMI
> in the kdump kernel.
> 
> There are several possible ways to solve this problem
> 
> 1. Put the offline cores in a state such that they are
> not woken up for machine check and HMI errors. This
> does not work, since we might need to wake up offline
> threads to handle TB errors
> 2. Ignore HMI errors, setup HMEER to mask HMI errors,
> but this still leads the window open for any MCEs
> and masking them for the duration of the dump might
> be a concern
> 3. Wake up offline CPUs, as in send them to
> crash_ipi_callback (not wake them up as in mark them
> online as seen by the hotplug). kexec does a
> wake_online_cpus() call, this patch does something
> similar, but instead sends an IPI and forces them to
> crash_ipi_callback()
> 
> This patch takes approach #3.
> 
> Care is taken to enable this only for powenv platforms
> via crash_wake_offline (a global value set at setup
> time). The crash code sends out IPI's to all CPU's
> which then move to crash_ipi_callback and kexec_smp_wait().
> 
> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
> ---
> 
> Changelog v3
>  - Use SRR1's reason to wake up to drive replay_system_reset()
>    as a means of getting to kdump() as opposed to calling
>    crash_ipi_callback based on comments from Nick Piggin.
> 
>  arch/powerpc/include/asm/kexec.h     |  2 ++
>  arch/powerpc/kernel/crash.c          | 13 ++++++++++++-
>  arch/powerpc/kernel/smp.c            | 18 ++++++++++++++++++
>  arch/powerpc/platforms/powernv/smp.c | 12 ++++++++++++
>  4 files changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
> index 4419d435639a..9dcbfa6bbb91 100644
> --- a/arch/powerpc/include/asm/kexec.h
> +++ b/arch/powerpc/include/asm/kexec.h
> @@ -73,6 +73,8 @@ extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
>  					  master to copy new code to 0 */
>  extern int crashing_cpu;
>  extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *));
> +extern void crash_ipi_callback(struct pt_regs *);
> +extern int crash_wake_offline;
>  
>  struct kimage;
>  struct pt_regs;
> diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
> index 29c56ca2ddfd..00b215125d3e 100644
> --- a/arch/powerpc/kernel/crash.c
> +++ b/arch/powerpc/kernel/crash.c
> @@ -44,6 +44,14 @@
>  #define REAL_MODE_TIMEOUT	10000
>  
>  static int time_to_dump;
> +/*
> + * crash_wake_offline should be set to 1 by platforms that intend to wake
> + * up offline cpus prior to jumping to a kdump kernel. Currently powernv
> + * sets it to 1, since we want to avoid things from happening when an
> + * offline CPU wakes up due to something like an HMI (malfunction error),
> + * which propagates to all threads.
> + */
> +int crash_wake_offline;
>  
>  #define CRASH_HANDLER_MAX 3
>  /* List of shutdown handles */
> @@ -63,7 +71,7 @@ static int handle_fault(struct pt_regs *regs)
>  #ifdef CONFIG_SMP
>  
>  static atomic_t cpus_in_crash;
> -static void crash_ipi_callback(struct pt_regs *regs)
> +void crash_ipi_callback(struct pt_regs *regs)
>  {
>  	static cpumask_t cpus_state_saved = CPU_MASK_NONE;
>  
> @@ -106,6 +114,9 @@ static void crash_kexec_prepare_cpus(int cpu)
>  
>  	printk(KERN_EMERG "Sending IPI to other CPUs\n");
>  
> +	if (crash_wake_offline)
> +		ncpus = num_present_cpus() - 1;
> +
>  	crash_send_ipi(crash_ipi_callback);
>  	smp_wmb();
>  
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index e0a4c1f82e25..bbe7634b3a43 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -543,7 +543,25 @@ void smp_send_debugger_break(void)
>  #ifdef CONFIG_KEXEC_CORE
>  void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
>  {
> +	int cpu;
> +
>  	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
> +	if (kdump_in_progress() && crash_wake_offline) {
> +		for_each_present_cpu(cpu) {
> +			if (cpu_online(cpu))
> +				continue;
> +			/*
> +			 * crash_ipi_callback will wait for
> +			 * all cpus, including offline CPUs.
> +			 * We don't care about nmi_ipi_function.
> +			 * Offline cpus will jump straight into
> +			 * crash_ipi_callback, we can skip the
> +			 * entire NMI dance and waiting for
> +			 * cpus to clear pending mask, etc.
> +			 */
> +			do_smp_send_nmi_ipi(cpu);
> +		}
> +	}
>  }
>  #endif
>  
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> index ba030669eca1..1594bbff3dec 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -37,6 +37,8 @@
>  #include <asm/kvm_ppc.h>
>  #include <asm/ppc-opcode.h>
>  #include <asm/cpuidle.h>
> +#include <asm/kexec.h>
> +#include <asm/reg.h>
>  
>  #include "powernv.h"
>  
> @@ -212,6 +214,13 @@ static void pnv_smp_cpu_kill_self(void)
>  		}
>  		smp_mb();
>  
> +		/*
> +		 * For kdump kernels, we process the ipi and jump to
> +		 * handling the system reset exception.
> +		 */
> +		if (kdump_in_progress())
> +			irq_set_pending_from_srr1(srr1);
> +
>  		if (cpu_core_split_required())
>  			continue;
>  

I wonder if we want to special-case it for system reset only? Everything
is all going down for kdump anyway I guess, but theoretically we don't
want to put other interrupts in our pending mask.

We might as well just do it unconditionally as well, so we can easily test
and make sure we do something sane if we happen to take a stray sreset
here for some reason (e.g., pdbg).

I would do this:

 } else if ((srr1 & wmask) == SRR1_WAKERESET) {
     /* kdump or debug tools can sreset offline CPUs */
     irq_set_pending_from_srr1(srr1);
 }

But then you still need an explicit kdump check for non-sreset wakeups
because the platform may not implement sreset, or it may fall back to
normal IPI if the sreset scom fails. So you then still need your

if (kdump) crash_ipi_callback 

Thanks,
Nick
Balbir Singh Dec. 15, 2017, 3:34 a.m. UTC | #2
On Fri, Dec 15, 2017 at 2:10 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Fri, 15 Dec 2017 12:27:40 +1100
> Balbir Singh <bsingharora@gmail.com> wrote:
>
>> Certain HMI's such as malfunction error propagate through
>> all threads/core on the system. If a thread was offline
>> prior to us crashing the system and jumping to the kdump
>> kernel, bad things happen when it wakes up due to an HMI
>> in the kdump kernel.
>>
>> There are several possible ways to solve this problem
>>
>> 1. Put the offline cores in a state such that they are
>> not woken up for machine check and HMI errors. This
>> does not work, since we might need to wake up offline
>> threads to handle TB errors
>> 2. Ignore HMI errors, setup HMEER to mask HMI errors,
>> but this still leads the window open for any MCEs
>> and masking them for the duration of the dump might
>> be a concern
>> 3. Wake up offline CPUs, as in send them to
>> crash_ipi_callback (not wake them up as in mark them
>> online as seen by the hotplug). kexec does a
>> wake_online_cpus() call, this patch does something
>> similar, but instead sends an IPI and forces them to
>> crash_ipi_callback()
>>
>> This patch takes approach #3.
>>
>> Care is taken to enable this only for powenv platforms
>> via crash_wake_offline (a global value set at setup
>> time). The crash code sends out IPI's to all CPU's
>> which then move to crash_ipi_callback and kexec_smp_wait().
>>
>> Signed-off-by: Balbir Singh <bsingharora@gmail.com>
>> ---
>>
>> Changelog v3
>>  - Use SRR1's reason to wake up to drive replay_system_reset()
>>    as a means of getting to kdump() as opposed to calling
>>    crash_ipi_callback based on comments from Nick Piggin.
>>
>>  arch/powerpc/include/asm/kexec.h     |  2 ++
>>  arch/powerpc/kernel/crash.c          | 13 ++++++++++++-
>>  arch/powerpc/kernel/smp.c            | 18 ++++++++++++++++++
>>  arch/powerpc/platforms/powernv/smp.c | 12 ++++++++++++
>>  4 files changed, 44 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
>> index 4419d435639a..9dcbfa6bbb91 100644
>> --- a/arch/powerpc/include/asm/kexec.h
>> +++ b/arch/powerpc/include/asm/kexec.h
>> @@ -73,6 +73,8 @@ extern void kexec_smp_wait(void);   /* get and clear naca physid, wait for
>>                                         master to copy new code to 0 */
>>  extern int crashing_cpu;
>>  extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *));
>> +extern void crash_ipi_callback(struct pt_regs *);
>> +extern int crash_wake_offline;
>>
>>  struct kimage;
>>  struct pt_regs;
>> diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
>> index 29c56ca2ddfd..00b215125d3e 100644
>> --- a/arch/powerpc/kernel/crash.c
>> +++ b/arch/powerpc/kernel/crash.c
>> @@ -44,6 +44,14 @@
>>  #define REAL_MODE_TIMEOUT    10000
>>
>>  static int time_to_dump;
>> +/*
>> + * crash_wake_offline should be set to 1 by platforms that intend to wake
>> + * up offline cpus prior to jumping to a kdump kernel. Currently powernv
>> + * sets it to 1, since we want to avoid things from happening when an
>> + * offline CPU wakes up due to something like an HMI (malfunction error),
>> + * which propagates to all threads.
>> + */
>> +int crash_wake_offline;
>>
>>  #define CRASH_HANDLER_MAX 3
>>  /* List of shutdown handles */
>> @@ -63,7 +71,7 @@ static int handle_fault(struct pt_regs *regs)
>>  #ifdef CONFIG_SMP
>>
>>  static atomic_t cpus_in_crash;
>> -static void crash_ipi_callback(struct pt_regs *regs)
>> +void crash_ipi_callback(struct pt_regs *regs)
>>  {
>>       static cpumask_t cpus_state_saved = CPU_MASK_NONE;
>>
>> @@ -106,6 +114,9 @@ static void crash_kexec_prepare_cpus(int cpu)
>>
>>       printk(KERN_EMERG "Sending IPI to other CPUs\n");
>>
>> +     if (crash_wake_offline)
>> +             ncpus = num_present_cpus() - 1;
>> +
>>       crash_send_ipi(crash_ipi_callback);
>>       smp_wmb();
>>
>> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
>> index e0a4c1f82e25..bbe7634b3a43 100644
>> --- a/arch/powerpc/kernel/smp.c
>> +++ b/arch/powerpc/kernel/smp.c
>> @@ -543,7 +543,25 @@ void smp_send_debugger_break(void)
>>  #ifdef CONFIG_KEXEC_CORE
>>  void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
>>  {
>> +     int cpu;
>> +
>>       smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
>> +     if (kdump_in_progress() && crash_wake_offline) {
>> +             for_each_present_cpu(cpu) {
>> +                     if (cpu_online(cpu))
>> +                             continue;
>> +                     /*
>> +                      * crash_ipi_callback will wait for
>> +                      * all cpus, including offline CPUs.
>> +                      * We don't care about nmi_ipi_function.
>> +                      * Offline cpus will jump straight into
>> +                      * crash_ipi_callback, we can skip the
>> +                      * entire NMI dance and waiting for
>> +                      * cpus to clear pending mask, etc.
>> +                      */
>> +                     do_smp_send_nmi_ipi(cpu);
>> +             }
>> +     }
>>  }
>>  #endif
>>
>> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
>> index ba030669eca1..1594bbff3dec 100644
>> --- a/arch/powerpc/platforms/powernv/smp.c
>> +++ b/arch/powerpc/platforms/powernv/smp.c
>> @@ -37,6 +37,8 @@
>>  #include <asm/kvm_ppc.h>
>>  #include <asm/ppc-opcode.h>
>>  #include <asm/cpuidle.h>
>> +#include <asm/kexec.h>
>> +#include <asm/reg.h>
>>
>>  #include "powernv.h"
>>
>> @@ -212,6 +214,13 @@ static void pnv_smp_cpu_kill_self(void)
>>               }
>>               smp_mb();
>>
>> +             /*
>> +              * For kdump kernels, we process the ipi and jump to
>> +              * handling the system reset exception.
>> +              */
>> +             if (kdump_in_progress())
>> +                     irq_set_pending_from_srr1(srr1);
>> +
>>               if (cpu_core_split_required())
>>                       continue;
>>
>
> I wonder if we want to special-case it for system reset only? Everything
> is all going down for kdump anyway I guess, but theoretically we don't
> want to put other interrupts in our pending mask.
>
> We might as well just do it unconditionally as well, so we can easily test
> and make sure we do something sane if we happen to take a stray sreset
> here for some reason (e.g., pdbg).
>
> I would do this:
>
>  } else if ((srr1 & wmask) == SRR1_WAKERESET) {
>      /* kdump or debug tools can sreset offline CPUs */
>      irq_set_pending_from_srr1(srr1);
>  }
>
> But then you still need an explicit kdump check for non-sreset wakeups
> because the platform may not implement sreset, or it may fall back to
> normal IPI if the sreset scom fails. So you then still need your
>
> if (kdump) crash_ipi_callback

Yep, its required for the the non NMI case.


How does this look -- based on your comment

+               } else if ((srr1 & wmask) == SRR1_WAKESRESET) {
+                       irq_set_pending_from_srr1(srr1);
+                       /* Does not return */
                }
+
                smp_mb();

                /*
                 * For kdump kernels, we process the ipi and jump to
-                * handling the system reset exception.
+                * crash_ipi_callback
                 */
-               if (kdump_in_progress())
-                       irq_set_pending_from_srr1(srr1);
+               if (kdump_in_progress()) {
+                       /*
+                        * If we got to this point, we've not used
+                        * NMI's, otherwise we would have gone
+                        * via the SRR1_WAKESRESET path. We are
+                        * using regular IPI's for waking up offline
+                        * threads.
+                        */
+                       struct pt_regs regs;
+
+                       ppc_save_regs(&regs);
+                       crash_ipi_callback(regs);
+                       /* Does not return */
+               }

Balbir Singh

>
> Thanks,
> Nick
Nicholas Piggin Dec. 15, 2017, 4:47 a.m. UTC | #3
On Fri, 15 Dec 2017 14:34:03 +1100
Balbir Singh <bsingharora@gmail.com> wrote:

> On Fri, Dec 15, 2017 at 2:10 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> > On Fri, 15 Dec 2017 12:27:40 +1100
> > Balbir Singh <bsingharora@gmail.com> wrote:

> > But then you still need an explicit kdump check for non-sreset wakeups
> > because the platform may not implement sreset, or it may fall back to
> > normal IPI if the sreset scom fails. So you then still need your
> >
> > if (kdump) crash_ipi_callback  
> 
> Yep, its required for the the non NMI case.
> 
> 
> How does this look -- based on your comment
> 
> +               } else if ((srr1 & wmask) == SRR1_WAKESRESET) {
> +                       irq_set_pending_from_srr1(srr1);
> +                       /* Does not return */
>                 }
> +
>                 smp_mb();
> 
>                 /*
>                  * For kdump kernels, we process the ipi and jump to
> -                * handling the system reset exception.
> +                * crash_ipi_callback
>                  */
> -               if (kdump_in_progress())
> -                       irq_set_pending_from_srr1(srr1);
> +               if (kdump_in_progress()) {
> +                       /*
> +                        * If we got to this point, we've not used
> +                        * NMI's, otherwise we would have gone
> +                        * via the SRR1_WAKESRESET path. We are
> +                        * using regular IPI's for waking up offline
> +                        * threads.
> +                        */
> +                       struct pt_regs regs;
> +
> +                       ppc_save_regs(&regs);
> +                       crash_ipi_callback(regs);
> +                       /* Does not return */
> +               }

That looks like it should do the trick.

Thanks,
Nick
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 4419d435639a..9dcbfa6bbb91 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -73,6 +73,8 @@  extern void kexec_smp_wait(void);	/* get and clear naca physid, wait for
 					  master to copy new code to 0 */
 extern int crashing_cpu;
 extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *));
+extern void crash_ipi_callback(struct pt_regs *);
+extern int crash_wake_offline;
 
 struct kimage;
 struct pt_regs;
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 29c56ca2ddfd..00b215125d3e 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -44,6 +44,14 @@ 
 #define REAL_MODE_TIMEOUT	10000
 
 static int time_to_dump;
+/*
+ * crash_wake_offline should be set to 1 by platforms that intend to wake
+ * up offline cpus prior to jumping to a kdump kernel. Currently powernv
+ * sets it to 1, since we want to avoid things from happening when an
+ * offline CPU wakes up due to something like an HMI (malfunction error),
+ * which propagates to all threads.
+ */
+int crash_wake_offline;
 
 #define CRASH_HANDLER_MAX 3
 /* List of shutdown handles */
@@ -63,7 +71,7 @@  static int handle_fault(struct pt_regs *regs)
 #ifdef CONFIG_SMP
 
 static atomic_t cpus_in_crash;
-static void crash_ipi_callback(struct pt_regs *regs)
+void crash_ipi_callback(struct pt_regs *regs)
 {
 	static cpumask_t cpus_state_saved = CPU_MASK_NONE;
 
@@ -106,6 +114,9 @@  static void crash_kexec_prepare_cpus(int cpu)
 
 	printk(KERN_EMERG "Sending IPI to other CPUs\n");
 
+	if (crash_wake_offline)
+		ncpus = num_present_cpus() - 1;
+
 	crash_send_ipi(crash_ipi_callback);
 	smp_wmb();
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index e0a4c1f82e25..bbe7634b3a43 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -543,7 +543,25 @@  void smp_send_debugger_break(void)
 #ifdef CONFIG_KEXEC_CORE
 void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 {
+	int cpu;
+
 	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
+	if (kdump_in_progress() && crash_wake_offline) {
+		for_each_present_cpu(cpu) {
+			if (cpu_online(cpu))
+				continue;
+			/*
+			 * crash_ipi_callback will wait for
+			 * all cpus, including offline CPUs.
+			 * We don't care about nmi_ipi_function.
+			 * Offline cpus will jump straight into
+			 * crash_ipi_callback, we can skip the
+			 * entire NMI dance and waiting for
+			 * cpus to clear pending mask, etc.
+			 */
+			do_smp_send_nmi_ipi(cpu);
+		}
+	}
 }
 #endif
 
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index ba030669eca1..1594bbff3dec 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -37,6 +37,8 @@ 
 #include <asm/kvm_ppc.h>
 #include <asm/ppc-opcode.h>
 #include <asm/cpuidle.h>
+#include <asm/kexec.h>
+#include <asm/reg.h>
 
 #include "powernv.h"
 
@@ -212,6 +214,13 @@  static void pnv_smp_cpu_kill_self(void)
 		}
 		smp_mb();
 
+		/*
+		 * For kdump kernels, we process the ipi and jump to
+		 * handling the system reset exception.
+		 */
+		if (kdump_in_progress())
+			irq_set_pending_from_srr1(srr1);
+
 		if (cpu_core_split_required())
 			continue;
 
@@ -371,5 +380,8 @@  void __init pnv_smp_init(void)
 
 #ifdef CONFIG_HOTPLUG_CPU
 	ppc_md.cpu_die	= pnv_smp_cpu_kill_self;
+#ifdef CONFIG_KEXEC_CORE
+	crash_wake_offline = 1;
+#endif
 #endif
 }