[RFC,3/3] core: Add support for quiescing OPAL

Message ID 20171022100648.1919-4-npiggin@gmail.com
State New
Headers show
Series
  • add support for quiescing OPAL
Related show

Commit Message

Nicholas Piggin Oct. 22, 2017, 10:06 a.m.
Quiescing is ensuring all host controlled CPUs (except the current one)
are out of OPAL and prevented from entering. This can be used in debug
and shutdown paths, particularly with system reset sequences.

This patch adds per-CPU entry and exit tracking for OPAL calls, and adds
logic to "hold" or "reject" at entry time, if OPAL is quiesced.

fast reboot is switched to using quiescing rather than "wait for a while".
If firmware cannot be quiesced, then fast reboot is skipped (untested
because my P8 is broken). If it can be quiesced, then it will generally
reduce fast reboot times by about 200ms.

An OPAL call is added, for the host to use in similar situations.
---
 asm/head.S                                    |  32 +----
 core/cpu.c                                    |   5 +
 core/fast-reboot.c                            |  21 ++-
 core/opal.c                                   | 183 ++++++++++++++++++++++++--
 core/platform.c                               |   4 +
 doc/opal-api/opal-quiesce-158.rst             |  54 ++++++++
 doc/opal-api/opal-signal-system-reset-145.rst |   7 +
 include/cpu.h                                 |   2 +
 include/lock.h                                |  12 ++
 include/opal-api.h                            |   8 +-
 include/opal-internal.h                       |   2 +
 11 files changed, 282 insertions(+), 48 deletions(-)
 create mode 100644 doc/opal-api/opal-quiesce-158.rst

Patch

diff --git a/asm/head.S b/asm/head.S
index d6b58be9..a221bab1 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -913,8 +913,7 @@  opal_entry:
 	std	%r1,STACK_GPR1(%r12)
 	mr	%r1,%r12
 
-	/* May save arguments for tracing */
-#ifdef OPAL_TRACE_ENTRY
+	/* Save arguments because we call C */
 	std	%r3,STACK_GPR3(%r1)
 	std	%r4,STACK_GPR4(%r1)
 	std	%r5,STACK_GPR5(%r1)
@@ -923,7 +922,7 @@  opal_entry:
 	std	%r8,STACK_GPR8(%r1)
 	std	%r9,STACK_GPR9(%r1)
 	std	%r10,STACK_GPR10(%r1)
-#endif
+
 	/* Save Token (r0), LR and r13 */
 	mflr	%r12
 	std	%r0,STACK_GPR0(%r1)
@@ -944,15 +943,10 @@  opal_entry:
 	addis	%r2,%r2,(__toc_start - __head)@ha
 	addi	%r2,%r2,(__toc_start - __head)@l
 
-	/* Check for a reboot in progress */
-	LOAD_ADDR_FROM_TOC(%r12, reboot_in_progress)
-	lbz	%r12,0(%r12)
-	cmpwi	%r12,0
-	bne	3f
-
-#ifdef OPAL_TRACE_ENTRY
 	mr	%r3,%r1
-	bl	opal_trace_entry
+	bl	opal_entry_check
+	cmpdi	%r3,0
+	bne	1f
 	ld	%r0,STACK_GPR0(%r1)
 	ld	%r3,STACK_GPR3(%r1)
 	ld	%r4,STACK_GPR4(%r1)
@@ -962,39 +956,25 @@  opal_entry:
 	ld	%r8,STACK_GPR8(%r1)
 	ld	%r9,STACK_GPR9(%r1)
 	ld	%r10,STACK_GPR10(%r1)
-#endif /* OPAL_TRACE_ENTRY */
 
 	/* Convert our token into a table entry and get the
 	 * function pointer. Also check the token.
 	 * For ELFv2 ABI, the local entry point is used so no need for r12.
 	 */
-	cmpldi	%r0,OPAL_LAST
-	bgt-	2f
 	sldi	%r0,%r0,3
 	LOAD_ADDR_FROM_TOC(%r12, opal_branch_table)
 	ldx	%r0,%r12,%r0
-	cmpldi	%r0,0
-	beq-	2f
 	mtctr	%r0
 
 	/* Jump ! */
 	bctrl
-
+	bl	opal_exit_check
 1:	ld	%r12,STACK_LR(%r1)
 	mtlr	%r12
 	ld	%r13,STACK_GPR13(%r1)
 	ld	%r1,STACK_GPR1(%r1)
 	blr
 
-2:	/* Bad token */
-	ld	%r3,STACK_GPR0(%r1)
-	bl	opal_bad_token
-	b	1b
-
-3:	/* Reboot in progress, reject all calls */
-	li	%r3,OPAL_BUSY
-	b	1b
-
 .global start_kernel
 start_kernel:
 	sync
diff --git a/core/cpu.c b/core/cpu.c
index 27e0d6cf..7553b08c 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -1189,6 +1189,11 @@  static int64_t opal_return_cpu(void)
 {
 	prlog(PR_DEBUG, "OPAL: Returning CPU 0x%04x\n", this_cpu()->pir);
 
+	this_cpu()->in_opal_call--;
+	if (this_cpu()->in_opal_call != 0) {
+		printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
+	}
+
 	__secondary_cpu_entry();
 
 	return OPAL_HARDWARE; /* Should not happen */
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 8af5c590..98e6f4f7 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -35,7 +35,6 @@ 
 
 
 /* Flag tested by the OPAL entry code */
-uint8_t reboot_in_progress;
 static volatile bool fast_boot_release;
 static struct cpu_thread *last_man_standing;
 static struct lock reset_lock = LOCK_UNLOCKED;
@@ -333,19 +332,17 @@  void fast_reboot(void)
 	}
 	unlock(&fast_reboot_disabled_lock);
 
+	/*
+	 * Ensure all other CPUs have left OPAL calls.
+	 */
+	if (!opal_quiesce(QUIESCE_HOLD, -1)) {
+		prlog(PR_DEBUG, "RESET: Fast reboot disabled because OPAL calls not quiescing\n");
+		return;
+	}
+
 	prlog(PR_NOTICE, "RESET: Initiating fast reboot %d...\n", ++fast_reboot_count);
 	free(fdt);
 
-	/* XXX We need a way to ensure that no other CPU is in skiboot
-	 * holding locks (via the OPAL APIs) and if they are, we need
-	 * for them to get out. Hopefully that isn't happening, but...
-	 *
-	 * To fix this properly, we want to keep track of OPAL entry/exit
-	 * on all CPUs.
-	 */
-	reboot_in_progress = 1;
-	time_wait_ms(200);
-
 	/* Lock so the new guys coming don't reset us */
 	lock(&reset_lock);
 
@@ -552,7 +549,7 @@  void __noreturn fast_reboot_entry(void)
 
 	/* Clear release flag for next time */
 	fast_boot_release = false;
-	reboot_in_progress = 0;
+	opal_quiesce(QUIESCE_RESUME, -1);
 
 	/* Cleanup ourselves */
 	cleanup_cpu_state();
diff --git a/core/opal.c b/core/opal.c
index 8095f731..6f627d4c 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -92,19 +92,12 @@  long opal_bad_token(uint64_t token)
 	return OPAL_PARAMETER;
 }
 
-/* Called from head.S, thus no prototype */
-void opal_trace_entry(struct stack_frame *eframe);
-
-void opal_trace_entry(struct stack_frame *eframe)
+static void opal_trace_entry(struct stack_frame *eframe __unused)
 {
+#ifdef OPAL_TRACE_ENTRY
 	union trace t;
 	unsigned nargs, i;
 
-	if (this_cpu()->pir != mfspr(SPR_PIR)) {
-		printf("CPU MISMATCH ! PIR=%04lx cpu @%p -> pir=%04x\n",
-		       mfspr(SPR_PIR), this_cpu(), this_cpu()->pir);
-		abort();
-	}
 	if (eframe->gpr[0] > OPAL_LAST)
 		nargs = 0;
 	else
@@ -117,7 +110,179 @@  void opal_trace_entry(struct stack_frame *eframe)
 		t.opal.r3_to_11[i] = cpu_to_be64(eframe->gpr[3+i]);
 
 	trace_add(&t, TRACE_OPAL, offsetof(struct trace_opal, r3_to_11[nargs]));
+#endif
+}
+
+/*
+ * opal_quiesce_state is used as a lock. Don't use an actual lock to avoid
+ * lock busting.
+ */
+static uint32_t opal_quiesce_state;	/* 0 or QUIESCE_HOLD/QUIESCE_REJECT */
+static int32_t opal_quiesce_owner;	/* PIR */
+static int32_t opal_quiesce_target;	/* -1 or PIR */
+
+static int64_t opal_check_token(uint64_t token);
+
+/* Called from head.S, thus no prototype */
+int64_t opal_entry_check(struct stack_frame *eframe);
+
+int64_t opal_entry_check(struct stack_frame *eframe)
+{
+	struct cpu_thread *cpu = this_cpu();
+	uint64_t token = eframe->gpr[0];
+
+	if (cpu->pir != mfspr(SPR_PIR)) {
+		printf("CPU MISMATCH ! PIR=%04lx cpu @%p -> pir=%04x\n",
+		       mfspr(SPR_PIR), cpu, cpu->pir);
+		abort();
+	}
+
+	opal_trace_entry(eframe);
+
+	if (!opal_check_token(token))
+		return opal_bad_token(token);
+
+	if (!opal_quiesce_state && cpu->in_opal_call) {
+		printf("CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=%04lx cpu @%p -> pir=%04x\n",
+		       mfspr(SPR_PIR), cpu, cpu->pir);
+		return OPAL_BUSY;
+	}
+
+again:
+	cpu->in_opal_call++;
+	sync(); /* Store in_opal_call vs load quiesce_opal_call */
+	if (cpu->quiesce_opal_call) {
+		cpu->in_opal_call--;
+		if (opal_quiesce_state == QUIESCE_REJECT)
+			return OPAL_BUSY;
+		smt_lowest();
+		while (cpu->quiesce_opal_call)
+			barrier();
+		smt_medium();
+		goto again;
+	}
+
+	return OPAL_SUCCESS;
+}
+
+void opal_exit_check(void);
+
+void opal_exit_check(void)
+{
+	struct cpu_thread *cpu = this_cpu();
+
+	if (!cpu->in_opal_call) {
+		printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x\n",
+		       mfspr(SPR_PIR), cpu, cpu->pir);
+	} else {
+		cpu->in_opal_call--;
+	}
+}
+
+int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
+{
+	struct cpu_thread *cpu = this_cpu();
+	struct cpu_thread *c = NULL;
+	uint64_t end;
+	bool stuck = false;
+
+	if (cpu_target >= 0) {
+		c = find_cpu_by_server(cpu_target);
+		if (!c)
+			return OPAL_PARAMETER;
+	} else if (cpu_target != -1) {
+		return OPAL_PARAMETER;
+	}
+
+	if (quiesce_type == QUIESCE_HOLD || quiesce_type == QUIESCE_REJECT) {
+		if (cmpxchg32(&opal_quiesce_state, 0, quiesce_type) != 0) {
+			printf("opal_quiesce already quiescing\n");
+			return OPAL_BUSY;
+		}
+		opal_quiesce_owner = cpu->pir;
+		opal_quiesce_target = cpu_target;
+	}
+
+	if (opal_quiesce_owner != cpu->pir) {
+		printf("opal_quiesce CPU does not own quiesce state (must call QUIESCE_HOLD or QUIESCE_REJECT)\n");
+		return OPAL_BUSY;
+	}
+
+	/* Okay now we own the quiesce state */
+
+	if (quiesce_type == QUIESCE_RESUME) {
+		bust_locks = false;
+		sync();
+		if (c) {
+			c->quiesce_opal_call = false;
+		} else {
+			for_each_cpu(c) {
+				if (c == cpu)
+					continue;
+				c->quiesce_opal_call = false;
+			}
+		}
+		sync();
+		opal_quiesce_state = 0;
+		return OPAL_SUCCESS;
+	}
+
+	if (quiesce_type == QUIESCE_LOCK_BREAK) {
+		if (opal_quiesce_target != -1) {
+			printf("opal_quiesce has not quiesced all CPUs (must target -1)\n");
+			return OPAL_BUSY;
+		}
+		bust_locks = true;
+		return OPAL_SUCCESS;
+	}
+
+	if (c) {
+		c->quiesce_opal_call = true;
+	} else {
+		for_each_cpu(c) {
+			if (c == cpu)
+				continue;
+			c->quiesce_opal_call = true;
+		}
+	}
+	sync();
+
+	end = mftb() + msecs_to_tb(1000);
+
+	smt_lowest();
+	if (c) {
+		while (c->in_opal_call) {
+			if (tb_compare(mftb(), end) == TB_AAFTERB) {
+				printf("OPAL quiesce CPU:%04x stuck in OPAL\n", c->pir);
+				stuck = true;
+				break;
+			}
+			barrier();
+		}
+	} else {
+		for_each_cpu(c) {
+			if (c == cpu)
+				continue;
+			while (c->in_opal_call) {
+				if (tb_compare(mftb(), end) == TB_AAFTERB) {
+					printf("OPAL quiesce CPU:%04x stuck in OPAL\n", c->pir);
+					stuck = true;
+					break;
+				}
+				barrier();
+			}
+		}
+	}
+	smt_medium();
+
+	if (stuck) {
+		printf("OPAL quiesce could not kick all CPUs out of OPAL\n");
+		return OPAL_PARTIAL;
+	}
+
+	return OPAL_SUCCESS;
 }
+opal_call(OPAL_QUIESCE, opal_quiesce, 2);
 
 void __opal_register(uint64_t token, void *func, unsigned int nargs)
 {
diff --git a/core/platform.c b/core/platform.c
index 732f67e5..747b74d7 100644
--- a/core/platform.c
+++ b/core/platform.c
@@ -54,6 +54,8 @@  static int64_t opal_cec_reboot(void)
 {
 	prlog(PR_NOTICE, "OPAL: Reboot request...\n");
 
+	opal_quiesce(QUIESCE_HOLD, -1);
+
 	console_complete_flush();
 
 	/* Try fast-reset unless explicitly disabled */
@@ -71,6 +73,8 @@  static int64_t opal_cec_reboot2(uint32_t reboot_type, char *diag)
 {
 	struct errorlog *buf;
 
+	opal_quiesce(QUIESCE_HOLD, -1);
+
 	switch (reboot_type) {
 	case OPAL_REBOOT_NORMAL:
 		return opal_cec_reboot();
diff --git a/doc/opal-api/opal-quiesce-158.rst b/doc/opal-api/opal-quiesce-158.rst
new file mode 100644
index 00000000..d29d9835
--- /dev/null
+++ b/doc/opal-api/opal-quiesce-158.rst
@@ -0,0 +1,54 @@ 
+.. _opal-quiesce:
+
+OPAL_QUIESCE
+============
+
+The host OS can use OPAL_QUIESCE to ensure CPUs under host control are not
+executing OPAL. This is useful in crash or shutdown scenarios to try to
+ensure that CPUs are not holding locks, and is intended to be used with
+OPAL_SIGNAL_SYSTEM_RESET, for example.
+
+Arguments
+---------
+::
+
+  uint32_t quiesce_type
+    QUIESCE_HOLD        Wait for all target(s) currently executing OPAL to
+                        return to the host. Any new OPAL call that is made
+                        will be held off until QUIESCE_RESUME.
+    QUIESCE_REJECT      Wait for all target(s) currently executing OPAL to
+                        return to the host. Any new OPAL call that is made
+                        will fail with OPAL_BUSY until QUIESCE_RESUME.
+    QUIESCE_LOCK_BREAK  After QUIESCE_HOLD or QUIESCE_REJECT is successful,
+                        the CPU can call QUIESCE_LOCK_BREAK to skip all
+                        locking in OPAL to give the best chance of making
+                        progress in the crash/debug paths. The host should
+                        ensure all other CPUs are stopped (e.g., with
+                        OPAL_SIGNAL_SYSTEM_RESET) before this call is made, to
+                        avoid concurrency.
+    QUIESCE_RESUME      Undo the effects of QUIESCE_HOLD/QUIESCE_REJECT and
+                        QUIESCE_LOCK_BREAK calls.
+
+  int32_t target_cpu
+    cpu_nr >= 0        The cpu server number of the target cpu to reset.
+    -1                 All cpus except the current one should be quiesced.
+
+Returns
+-------
+OPAL_SUCCESS
+  The quiesce call was successful.
+
+OPAL_PARTIAL
+  Some or all of the CPUs executing OPAL when the call was made did not
+  return to the host after a timeout of 1 second. This is a best effort
+  at quiescing OPAL, and QUIESCE_RESUME must be called to resume normal
+  firmware operation.
+
+OPAL_PARAMETER
+  A parameter was incorrect.
+
+OPAL_BUSY
+  This CPU was not able to complete the operation, either because another
+  has concurrently started quiescing the system, or because it has not
+  successfully called QUIESCE_HOLD or QUIESCE_REJECT before attempting
+  QUIESCE_LOCK_BREAK or QUIESCE_RESUME.
diff --git a/doc/opal-api/opal-signal-system-reset-145.rst b/doc/opal-api/opal-signal-system-reset-145.rst
index 28e5e2f4..98baef72 100644
--- a/doc/opal-api/opal-signal-system-reset-145.rst
+++ b/doc/opal-api/opal-signal-system-reset-145.rst
@@ -17,6 +17,13 @@  raised when the target has MSR[RI]=0), so it should not be used in
 normal operation, but only for crashing, debugging, and similar
 exceptional cases.
 
+OPAL_SIGNAL_SYSTEM_RESET can pull CPUs out of OPAL, which may be
+undesirable in a crash or shutdown situation (e.g., because they may
+hold locks which are required to access the console, or may be halfway
+through setting hardware registers), so OPAL_QUIESCE can be used
+before OPAL_SIGNAL_SYSTEM_RESET to (attempt to) ensure all CPUs are
+out of OPAL before being interrupted.
+
 Arguments
 ---------
 ::
diff --git a/include/cpu.h b/include/cpu.h
index 168fa994..bb0b4eaa 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -54,9 +54,11 @@  struct cpu_thread {
 	struct trace_info		*trace;
 	uint64_t			save_r1;
 	void				*icp_regs;
+	uint32_t			in_opal_call;
 	uint32_t			lock_depth;
 	uint32_t			con_suspend;
 	bool				con_need_flush;
+	bool				quiesce_opal_call;
 	bool				in_mcount;
 	bool				in_poller;
 	bool				in_reinit;
diff --git a/include/lock.h b/include/lock.h
index b42afaa4..1597f422 100644
--- a/include/lock.h
+++ b/include/lock.h
@@ -18,6 +18,7 @@ 
 #define __LOCK_H
 
 #include <stdbool.h>
+#include <processor.h>
 
 struct lock {
 	/* Lock value has bit 63 as lock bit and the PIR of the owner
@@ -106,6 +107,17 @@  static inline uint64_t __cmpxchg64(uint64_t *mem, uint64_t old, uint64_t new)
 	return prev;
 }
 
+static inline uint32_t cmpxchg32(uint32_t *mem, uint32_t old, uint32_t new)
+{
+	uint32_t prev;
+
+	sync();
+	prev = __cmpxchg32(mem, old,new);
+	sync();
+
+	return prev;
+}
+
 extern bool try_lock(struct lock *l);
 extern void lock(struct lock *l);
 extern void unlock(struct lock *l);
diff --git a/include/opal-api.h b/include/opal-api.h
index 0bc036ed..c4b45d89 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -214,7 +214,13 @@ 
 #define OPAL_SET_POWER_SHIFT_RATIO		155
 #define OPAL_SENSOR_GROUP_CLEAR			156
 #define OPAL_PCI_SET_P2P			157
-#define OPAL_LAST				157
+#define OPAL_QUIESCE				158
+#define OPAL_LAST				158
+
+#define QUIESCE_HOLD			1 /* Spin all calls at entry */
+#define QUIESCE_REJECT			2 /* Fail all calls with OPAL_BUSY */
+#define QUIESCE_LOCK_BREAK		3 /* Set to ignore locks. */
+#define QUIESCE_RESUME			4 /* Un-quiesce */
 
 /* Device tree flags */
 
diff --git a/include/opal-internal.h b/include/opal-internal.h
index 583e9994..8d3d0a17 100644
--- a/include/opal-internal.h
+++ b/include/opal-internal.h
@@ -61,6 +61,8 @@  extern void add_opal_node(void);
 			(func), (nargs))
 extern void __opal_register(uint64_t token, void *func, unsigned num_args);
 
+int64_t opal_quiesce(uint32_t shutdown_type, int32_t cpu);
+
 /* Warning: no locking at the moment, do at init time only
  *
  * XXX TODO: Add the big RCU-ish "opal API lock" to protect us here