[RFC,2/2] powerpc/powernv: implement NMI IPIs with OPAL_SIGNAL_SYSTEM_RESET

Message ID 20170912160553.13422-3-npiggin@gmail.com
State Superseded
Headers show
Series
  • NMI IPI work in progress for Linux and OPAL
Related show

Commit Message

Nicholas Piggin Sept. 12, 2017, 4:05 p.m.
There are two complications. The first is that an sreset from a stop state
comes in with SRR1 set to do a powersave wakeup, with an sreset reason
encoded.

The second is that threads on the same core can't be signalled directly
so we must designate a bounce CPU to reflect the IPI back.
---
 arch/powerpc/include/asm/opal-api.h            |   1 +
 arch/powerpc/include/asm/opal.h                |   2 +
 arch/powerpc/kernel/irq.c                      |  13 +++
 arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
 arch/powerpc/platforms/powernv/powernv.h       |   1 +
 arch/powerpc/platforms/powernv/setup.c         |   3 +
 arch/powerpc/platforms/powernv/smp.c           | 111 +++++++++++++++++++++++++
 7 files changed, 132 insertions(+)

Comments

Nicholas Piggin Sept. 13, 2017, 1:13 p.m. | #1
On Wed, 13 Sep 2017 02:05:53 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> There are two complications. The first is that sreset from stop states
> come in with SRR1 set to do a powersave wakeup, with an sreset reason
> encoded.
> 
> The second is that threads on the same core can't be signalled directly
> so we must designate a bounce CPU to reflect the IPI back.

Here is an updated Linux patch for the latest OPAL patch. It also has
a few assorted fixes to make it work nicely; I have rolled them into
one patch here to make it easy to apply for testing the OPAL patch.

Thanks,
Nick

---
 arch/powerpc/include/asm/opal-api.h            |  1 +
 arch/powerpc/include/asm/opal.h                |  2 +
 arch/powerpc/kernel/irq.c                      | 18 ++++++
 arch/powerpc/kernel/watchdog.c                 | 30 +++++++--
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/powernv.h       |  1 +
 arch/powerpc/platforms/powernv/setup.c         |  3 +
 arch/powerpc/platforms/powernv/smp.c           | 89 ++++++++++++++++++++++++++
 arch/powerpc/xmon/xmon.c                       | 17 +++--
 9 files changed, 151 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..e39f4236b413 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET                145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..5f2c0367bab2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,10 +407,28 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+/*
+ * System reset does not have to wait for Linux interrupts
+ * to be re-enabled, so just replay it now.
+ */
+static noinline void replay_system_reset(void)
+{
+	struct pt_regs regs;
+
+	ppc_save_regs(&regs);
+
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
+	if (unlikely(idx == 4))
+		replay_system_reset();
+
 	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
 	 * so this can be called unconditionally with srr1 wake reason.
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..a6aa85b0cdeb 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -61,6 +61,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
  */
 static unsigned long __wd_smp_lock;
 static cpumask_t wd_smp_cpus_pending;
+static cpumask_t wd_smp_cpus_stuck_tmp;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
@@ -97,8 +98,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	else
 		dump_stack();
 
-	if (hardlockup_panic)
-		nmi_panic(regs, "Hard LOCKUP");
+	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
 static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
@@ -136,16 +136,29 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 
 	/*
 	 * Try to trigger the stuck CPUs.
+	 *
+	 * There is a bit of a hack for OPAL here because it can not
+	 * signal sibling threads. Don't try to signal those or mark
+	 * them stuck, in the hope that another core will notice.
 	 */
+	cpumask_clear(&wd_smp_cpus_stuck_tmp);
 	for_each_cpu(c, &wd_smp_cpus_pending) {
 		if (c == cpu)
 			continue;
-		smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+		if (firmware_has_feature(FW_FEATURE_OPAL)) {
+			if (cpumask_test_cpu(c, cpu_sibling_mask(cpu)))
+				continue;
+		}
+		cpumask_set_cpu(c, &wd_smp_cpus_stuck_tmp);
+		if (!sysctl_hardlockup_all_cpu_backtrace)
+			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
 	}
-	smp_flush_nmi_ipi(1000000);
 
 	/* Take the stuck CPUs out of the watch group */
-	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
+	set_cpumask_stuck(&wd_smp_cpus_stuck_tmp, tb);
+
+	if (!sysctl_hardlockup_all_cpu_backtrace)
+		smp_flush_nmi_ipi(1000000);
 
 	wd_smp_unlock(&flags);
 
@@ -275,9 +288,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
+	u64 tb = get_tb();
 
-	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
-		watchdog_timer_interrupt(cpu);
+	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+		per_cpu(wd_timer_tb, cpu) = tb;
+		wd_smp_clear_cpu_pending(cpu, tb);
+	}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+	ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..9da97962c93a 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,93 @@ static void __init pnv_smp_probe(void)
 	}
 }
 
+static int nmi_ipi_bounce_cpu;
+static int nmi_ipi_bounce_target_core;
+static int nmi_ipi_bounce_target_exclude;
+
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	if (nmi_ipi_bounce_cpu == smp_processor_id()) {
+		int c;
+		nmi_ipi_bounce_cpu = -1;
+		for_each_online_cpu(c) {
+			if (!cpumask_test_cpu(c, cpu_sibling_mask(
+						nmi_ipi_bounce_target_core)))
+				continue;
+			if (c == nmi_ipi_bounce_target_exclude)
+				continue;
+			opal_signal_system_reset(
+					get_hard_smp_processor_id(c));
+			/* can't do much with failure here */
+		}
+	}
+
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	if (cpu >= 0) {
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+		if (rc == OPAL_SUCCESS)
+			return 1;
+		return 0;
+	} else {
+		int c;
+
+		/*
+		 * Some platforms can not send NMI to sibling threads in
+		 * the same core. We can designate one inter-core target
+		 * to bounce NMIs back to our sibling threads.
+		 */
+
+		if (cpu >= 0) {
+			/*
+			 * Don't support bouncing unicast NMIs yet (because
+			 * that would have to raise an NMI on an unrelated
+			 * CPU. Revisit this if callers start using unicast.
+			 */
+			return 0;
+		}
+
+		nmi_ipi_bounce_cpu = -1;
+		nmi_ipi_bounce_target_core = -1;
+		nmi_ipi_bounce_target_exclude = -1;
+
+		for_each_online_cpu(c) {
+			if (cpumask_test_cpu(c, cpu_sibling_mask(smp_processor_id())))
+				continue;
+
+			if (nmi_ipi_bounce_cpu == -1) {
+				nmi_ipi_bounce_cpu = c;
+				nmi_ipi_bounce_target_core = smp_processor_id();
+				if (cpu == NMI_IPI_ALL_OTHERS)
+					nmi_ipi_bounce_target_exclude = smp_processor_id();
+				smp_mb();
+			} else {
+				rc = opal_signal_system_reset(
+						get_hard_smp_processor_id(c));
+				if (rc != OPAL_SUCCESS)
+					return 0;
+			}
+		}
+
+		if (nmi_ipi_bounce_cpu == -1)
+			return 0; /* could not find a bouncer */
+		rc = opal_signal_system_reset(
+				get_hard_smp_processor_id(nmi_ipi_bounce_cpu));
+		if (rc != OPAL_SUCCESS)
+			return 0;
+		return 1;
+	}
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +395,8 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET))
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 33351c6704b1..d9a12102b111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 
  waiting:
 	secondary = 1;
+	spin_begin();
 	while (secondary && !xmon_gate) {
 		if (in_xmon == 0) {
-			if (fromipi)
+			if (fromipi) {
+				spin_end();
 				goto leave;
+			}
 			secondary = test_and_set_bit(0, &in_xmon);
 		}
-		barrier();
+		spin_cpu_relax();
+		touch_nmi_watchdog();
 	}
+	spin_end();
 
 	if (!secondary && !xmon_gate) {
 		/* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		mb();
 		xmon_gate = 1;
 		barrier();
+		touch_nmi_watchdog();
 	}
 
  cmdloop:
 	while (in_xmon) {
 		if (secondary) {
+			spin_begin();
 			if (cpu == xmon_owner) {
 				if (!test_and_set_bit(0, &xmon_taken)) {
 					secondary = 0;
+					spin_end();
 					continue;
 				}
 				/* missed it */
 				while (cpu == xmon_owner)
-					barrier();
+					spin_cpu_relax();
 			}
-			barrier();
+			spin_cpu_relax();
+			touch_nmi_watchdog();
 		} else {
 			cmd = cmds(regs);
 			if (cmd != 0) {
Benjamin Herrenschmidt Sept. 14, 2017, 2:24 a.m. | #2
On Wed, 2017-09-13 at 23:13 +1000, Nicholas Piggin wrote:
> On Wed, 13 Sep 2017 02:05:53 +1000
> Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> > There are two complications. The first is that sreset from stop states
> > come in with SRR1 set to do a powersave wakeup, with an sreset reason
> > encoded.
> > 
> > The second is that threads on the same core can't be signalled directly
> > so we must designate a bounce CPU to reflect the IPI back.
> 
> Here is an updated Linux patch for the latest OPAL patch. This has
> a few assorted fixes as well to make it work nicely, I roll them into
> one patch here to make it easy to apply for testing the OPAL patch.

Why can't you sreset threads of the same core on P9 ?

Cheers,
Ben.
Nicholas Piggin Sept. 14, 2017, 6:32 a.m. | #3
On Thu, 14 Sep 2017 12:24:49 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Wed, 2017-09-13 at 23:13 +1000, Nicholas Piggin wrote:
> > On Wed, 13 Sep 2017 02:05:53 +1000
> > Nicholas Piggin <npiggin@gmail.com> wrote:
> >   
> > > There are two complications. The first is that sreset from stop states
> > > come in with SRR1 set to do a powersave wakeup, with an sreset reason
> > > encoded.
> > > 
> > > The second is that threads on the same core can't be signalled directly
> > > so we must designate a bounce CPU to reflect the IPI back.  
> > 
> > Here is an updated Linux patch for the latest OPAL patch. This has
> > a few assorted fixes as well to make it work nicely, I roll them into
> > one patch here to make it easy to apply for testing the OPAL patch.  
> 
> Why can't you sreset threads of the same core on P9 ?

It looks like we can; I think I still had some other bugs not ironed
out when I previously tested it.

That simplifies things a lot on the Linux side. It may be that the
bounce is still required if we implement it on POWER8 using ramming,
but I'll get the POWER9 code in first.

Thanks,
Nick
Alistair Popple Sept. 14, 2017, 6:43 a.m. | #4
On Thu, 14 Sep 2017 04:32:28 PM Nicholas Piggin wrote:
> On Thu, 14 Sep 2017 12:24:49 +1000
> Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:
> 
> > On Wed, 2017-09-13 at 23:13 +1000, Nicholas Piggin wrote:
> > > On Wed, 13 Sep 2017 02:05:53 +1000
> > > Nicholas Piggin <npiggin@gmail.com> wrote:
> > >   
> > > > There are two complications. The first is that sreset from stop states
> > > > come in with SRR1 set to do a powersave wakeup, with an sreset reason
> > > > encoded.
> > > > 
> > > > The second is that threads on the same core can't be signalled directly
> > > > so we must designate a bounce CPU to reflect the IPI back.  
> > > 
> > > Here is an updated Linux patch for the latest OPAL patch. This has
> > > a few assorted fixes as well to make it work nicely, I roll them into
> > > one patch here to make it easy to apply for testing the OPAL patch.  
> > 
> > Why can't you sreset threads of the same core on P9 ?
> 
> It looks like we can, I think I had some other bugs still not ironed
> out when I previously tested it.
> 
> That simplifies things a lot on the Linux side. It may be that the
> bounce is still required if we implement it on POWER8 using ramming,
> but I'll get the POWER9 code in first.

Right, the bouncing is still required on P8 because we need to ram instructions
and you can only ram instructions if all threads on a core are quiesced.

- Alistair

>
> Thanks,
> Nick
Nicholas Piggin Sept. 14, 2017, 11:26 a.m. | #5
On Wed, 13 Sep 2017 02:05:53 +1000
Nicholas Piggin <npiggin@gmail.com> wrote:

> There are two complications. The first is that sreset from stop states
> come in with SRR1 set to do a powersave wakeup, with an sreset reason
> encoded.
> 
> The second is that threads on the same core can't be signalled directly
> so we must designate a bounce CPU to reflect the IPI back.

This is a revised patch with only DD2 enablement. DD2 allows threads on
the same core to be IPIed. It's much simpler, and most of the code is
fixing the watchdog and preventing it from triggering from xmon (which
will be split into other patches of course).

It's probably a better starting point to get this working and merged
first, and then revisit bouncing.

---
 arch/powerpc/include/asm/opal-api.h            |  1 +
 arch/powerpc/include/asm/opal.h                |  2 ++
 arch/powerpc/kernel/irq.c                      | 20 ++++++++++++++++++
 arch/powerpc/kernel/watchdog.c                 | 29 +++++++++++++++-----------
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/powernv.h       |  1 +
 arch/powerpc/platforms/powernv/setup.c         |  3 +++
 arch/powerpc/platforms/powernv/smp.c           | 24 +++++++++++++++++++++
 arch/powerpc/xmon/xmon.c                       | 17 +++++++++++----
 9 files changed, 82 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..9d191ebea706 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET		145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..8ffebb9437e5 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,11 +407,31 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+static noinline void replay_system_reset(void)
+{
+	struct pt_regs regs;
+
+	ppc_save_regs(&regs);
+
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
 	/*
+	 * 0100b SRR1 reason is system reset. Take it now,
+	 * which is immediately after registers are restored
+	 * from idle. It's an NMI, so interrupts needn't be
+	 * re-enabled.
+	 */
+	if (unlikely(idx == 4))
+		replay_system_reset();
+
+	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
 	 * so this can be called unconditionally with srr1 wake reason.
 	 */
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..1fb9379dc683 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -97,8 +97,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	else
 		dump_stack();
 
-	if (hardlockup_panic)
-		nmi_panic(regs, "Hard LOCKUP");
+	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
 static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
@@ -134,15 +133,18 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n",
 			cpu, cpumask_pr_args(&wd_smp_cpus_pending));
 
-	/*
-	 * Try to trigger the stuck CPUs.
-	 */
-	for_each_cpu(c, &wd_smp_cpus_pending) {
-		if (c == cpu)
-			continue;
-		smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+	if (!sysctl_hardlockup_all_cpu_backtrace) {
+		/*
+		 * Try to trigger the stuck CPUs, unless we are going to
+		 * get a backtrace on all of them anyway.
+		 */
+		for_each_cpu(c, &wd_smp_cpus_pending) {
+			if (c == cpu)
+				continue;
+			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+		}
+		smp_flush_nmi_ipi(1000000);
 	}
-	smp_flush_nmi_ipi(1000000);
 
 	/* Take the stuck CPUs out of the watch group */
 	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
@@ -275,9 +277,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
+	u64 tb = get_tb();
 
-	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
-		watchdog_timer_interrupt(cpu);
+	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+		per_cpu(wd_timer_tb, cpu) = tb;
+		wd_smp_clear_cpu_pending(cpu, tb);
+	}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+	ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..83343832e07e 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,28 @@ static void __init pnv_smp_probe(void)
 	}
 }
 
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	rc = opal_signal_system_reset(cpu);
+	if (rc == OPAL_SUCCESS)
+		return 1;
+
+	/*
+	 * Don't cope with OPAL_PARTIAL yet (just punt to regular IPI)
+	 */
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +330,8 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET))
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 33351c6704b1..d9a12102b111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 
  waiting:
 	secondary = 1;
+	spin_begin();
 	while (secondary && !xmon_gate) {
 		if (in_xmon == 0) {
-			if (fromipi)
+			if (fromipi) {
+				spin_end();
 				goto leave;
+			}
 			secondary = test_and_set_bit(0, &in_xmon);
 		}
-		barrier();
+		spin_cpu_relax();
+		touch_nmi_watchdog();
 	}
+	spin_end();
 
 	if (!secondary && !xmon_gate) {
 		/* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		mb();
 		xmon_gate = 1;
 		barrier();
+		touch_nmi_watchdog();
 	}
 
  cmdloop:
 	while (in_xmon) {
 		if (secondary) {
+			spin_begin();
 			if (cpu == xmon_owner) {
 				if (!test_and_set_bit(0, &xmon_taken)) {
 					secondary = 0;
+					spin_end();
 					continue;
 				}
 				/* missed it */
 				while (cpu == xmon_owner)
-					barrier();
+					spin_cpu_relax();
 			}
-			barrier();
+			spin_cpu_relax();
+			touch_nmi_watchdog();
 		} else {
 			cmd = cmds(regs);
 			if (cmd != 0) {

Patch

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..bd9d1f2b3584 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@ 
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET 		145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..7d7613c49f2b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@  int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+int64_t opal_signal_system_reset(int32_t cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..3276e05cb53f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -407,10 +407,23 @@  static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+static noinline void system_reset(void)
+{
+	struct pt_regs regs;
+	ppc_save_regs(&regs);
+
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
 
+	if (unlikely(idx == 2 || idx == 4))
+		system_reset();
+
 	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
 	 * so this can be called unconditionally with srr1 wake reason.
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@  OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index a159d48573d7..49add2037e0d 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -3,6 +3,7 @@ 
 
 #ifdef CONFIG_SMP
 extern void pnv_smp_init(void);
+extern int pnv_system_reset_exception(struct pt_regs *regs);
 #else
 static inline void pnv_smp_init(void) { }
 #endif
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..4fdaa1d7c4cd 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,9 @@  static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+#ifdef CONFIG_SMP
+	ppc_md.system_reset_exception = pnv_system_reset_exception;
+#endif
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..45b1c191e3c8 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -290,6 +290,112 @@  static void __init pnv_smp_probe(void)
 	}
 }
 
+static int nmi_ipi_bounce_cpu;
+static int nmi_ipi_bounce_cpu_done;
+static int nmi_ipi_bounce_target_core;
+static int nmi_ipi_bounce_target_exclude;
+
+int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	smp_mb();
+	if (nmi_ipi_bounce_cpu == smp_processor_id()) {
+		int64_t rc;
+		int c;
+
+		nmi_ipi_bounce_cpu = -1;
+		smp_mb();
+		for_each_online_cpu(c) {
+			if (!cpumask_test_cpu(c, cpu_sibling_mask(nmi_ipi_bounce_target_core)))
+				continue;
+			if (c == nmi_ipi_bounce_target_exclude)
+				continue;
+			rc = opal_signal_system_reset(get_hard_smp_processor_id(c));
+			if (rc != OPAL_SUCCESS) {
+				nmi_ipi_bounce_cpu_done = -1;
+				return 1;
+			}
+		}
+		nmi_ipi_bounce_cpu_done = 1;
+	}
+
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	if (cpu >= 0) {
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+		if (rc == OPAL_SUCCESS)
+			return 1;
+		return 0;
+	} else {
+		/*
+		 * Test bounce behavior with broadcast IPI.
+		 */
+		rc = OPAL_PARTIAL;
+	}
+	if (rc == OPAL_PARTIAL) {
+		int c;
+
+		/*
+		 * Some platforms can not send NMI to sibling threads in
+		 * the same core. We can designate one inter-core target
+		 * to bounce NMIs back to our sibling threads.
+		 */
+
+		if (cpu >= 0) {
+			/*
+			 * Don't support bouncing unicast NMIs yet (because
+			 * that would have to raise an NMI on an unrelated
+			 * CPU. Revisit this if callers start using unicast.
+			 */
+			printk("CPU:%d pnv_cause_nmi_ipi can not bounce unicast IPIs!\n", smp_processor_id());
+			return 0;
+		}
+
+		nmi_ipi_bounce_cpu = -1;
+		nmi_ipi_bounce_cpu_done = 0;
+		nmi_ipi_bounce_target_core = -1;
+		nmi_ipi_bounce_target_exclude = -1;
+
+		for_each_online_cpu(c) {
+			if (cpumask_test_cpu(c, cpu_sibling_mask(smp_processor_id())))
+				continue;
+
+			if (nmi_ipi_bounce_cpu == -1) {
+				nmi_ipi_bounce_cpu = c;
+				nmi_ipi_bounce_target_core = smp_processor_id();
+				if (cpu == NMI_IPI_ALL_OTHERS)
+					nmi_ipi_bounce_target_exclude = smp_processor_id();
+				smp_mb();
+			} else {
+				rc = opal_signal_system_reset(get_hard_smp_processor_id(c));
+				if (rc != OPAL_SUCCESS)
+					return 0;
+			}
+		}
+
+		if (nmi_ipi_bounce_cpu == -1)
+			return 0; /* could not find a bouncer */
+
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(nmi_ipi_bounce_cpu));
+		if (rc != OPAL_SUCCESS)
+			return 0;
+
+		while (!nmi_ipi_bounce_cpu_done)
+			cpu_relax();
+
+		if (nmi_ipi_bounce_cpu_done == 1)
+			return 1; /* bounce worked */
+	}
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -308,6 +414,11 @@  static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) {
+		printk("OPAL_SIGNAL_SYSTEM_RESET available\n");
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
+	} else
+		printk("OPAL_SIGNAL_SYSTEM_RESET NOT available\n");
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU