[2/4] asm/head: implement quiescing without stack or clobbering regs

Message ID 20180408064939.18879-3-npiggin@gmail.com
State Accepted
Headers show
Series
  • next round of OPAL debugging improvements
Related show

Commit Message

Nicholas Piggin April 8, 2018, 6:49 a.m.
Quiescing is currently implemented in C in opal_entry before the
opal call handler is called. This works well enough for simple
cases like fast reset when one CPU wants all others out of the way.

Linux would like to use it to prevent an sreset IPI from
interrupting firmware, which could lead to deadlocks when crash
dumping or entering the debugger. Linux interrupts do not recover
well when returning back to general OPAL code, due to r13 not being
restored. OPAL also can't be re-entered, which may happen e.g.,
from the debugger.

So move the quiesce hold/reject to entry code, before the stack or
r1 or r13 registers are switched. OPAL can be interrupted and
returned to or re-entered during this period.

This does not completely solve all such problems. OPAL will be
interrupted with sreset if the quiesce times out, and it can be
interrupted by MCEs as well. These still have the issues above.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 asm/asm-offsets.c |  2 ++
 asm/head.S        | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 core/opal.c       | 41 +++++++++++--------------------
 include/cpu.h     |  2 +-
 4 files changed, 83 insertions(+), 34 deletions(-)

Patch

diff --git a/asm/asm-offsets.c b/asm/asm-offsets.c
index 71199503..3eac592d 100644
--- a/asm/asm-offsets.c
+++ b/asm/asm-offsets.c
@@ -37,6 +37,8 @@  int main(void)
 	OFFSET(CPUTHREAD_PIR, cpu_thread, pir);
 	OFFSET(CPUTHREAD_SAVE_R1, cpu_thread, save_r1);
 	OFFSET(CPUTHREAD_STATE, cpu_thread, state);
+	OFFSET(CPUTHREAD_IN_OPAL_CALL, cpu_thread, in_opal_call);
+	OFFSET(CPUTHREAD_QUIESCE_OPAL_CALL, cpu_thread, quiesce_opal_call);
 	OFFSET(CPUTHREAD_CUR_TOKEN, cpu_thread, current_token);
 	DEFINE(CPUTHREAD_GAP, sizeof(struct cpu_thread) + STACK_SAFETY_GAP);
 #ifdef STACK_CHECK_ENABLED
diff --git a/asm/head.S b/asm/head.S
index ad306252..eeefcaa4 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -955,14 +955,64 @@  opal_boot_trampoline:
  *       r0: Token
  *       r2: OPAL Base
  *  r3..r10: Args
- *      r12: Scratch
+ * r11..r12: Scratch
  * r13..r31: Preserved
- *
  */
 	.balign	0x10
 .global opal_entry
 opal_entry:
-	/* Get our per CPU stack */
+	/* Get our per CPU pointer in r12 to check for quiesce */
+	mfspr	%r12,SPR_PIR
+	GET_STACK(%r12,%r12)
+
+	/* Get CPU thread */
+	clrrdi	%r12,%r12,STACK_SHIFT
+
+	/*
+	 * OPAL entry must first increment in_opal_call, then check
+	 * for quiesce, without touching the stack or clobbering
+	 * registers other than r11 and r12 and cr0. In this way, OPAL
+	 * is tolerant of re-entry on this same CPU while it is spinning
+	 * for quiesce.
+	 *
+	 * Sequence goes:
+	 * in_opal_call++;
+	 * sync;
+	 * if (quiesce_opal_call) {
+	 *     in_opal_call--;
+	 *     reject-or-spin-then-retry;
+	 */
+1:	lwz	%r11,CPUTHREAD_IN_OPAL_CALL(%r12)
+	addi	%r11,%r11,1
+	stw	%r11,CPUTHREAD_IN_OPAL_CALL(%r12)
+	/*
+	 * Order the store in_opal_call vs load quiesce_opal_call.
+	 * This also provides an acquire barrier for opal entry vs
+	 * another thread quiescing opal. In this way, quiescing
+	 * can behave as mutual exclusion.
+	 */
+	sync
+	lwz	%r11,CPUTHREAD_QUIESCE_OPAL_CALL(%r12)
+	cmpwi	%cr0,%r11,0
+	beq+	4f
+	/* We are quiescing, hold or reject */
+	cmpwi	%cr0,%r11,QUIESCE_REJECT
+	bne	2f
+	li	%r3,OPAL_BUSY
+	b	.Lreturn /* reject */
+2:	/* hold */
+	lwz	%r11,CPUTHREAD_IN_OPAL_CALL(%r12)
+	subi	%r11,%r11,1
+	stw	%r11,CPUTHREAD_IN_OPAL_CALL(%r12)
+	smt_lowest
+3:	lwz	%r11,CPUTHREAD_QUIESCE_OPAL_CALL(%r12)
+	cmpwi	%cr0,%r11,QUIESCE_HOLD
+	beq	3b
+	/* spin finished, try again */
+	smt_medium
+	b	1b
+
+4:	/* Quiesce protocol done, get our per CPU stack */
 	mfspr	%r12,SPR_PIR
 	GET_STACK(%r12,%r12)
 	stdu	%r12,-STACK_FRAMESIZE(%r12)
@@ -1006,7 +1056,7 @@  opal_entry:
 	mr	%r3,%r1
 	bl	opal_entry_check
 	cmpdi	%r3,0
-	bne	1f
+	bne	.Lreturn
 
 	ld	%r0,STACK_GPR0(%r1)
 	ld	%r3,STACK_GPR3(%r1)
@@ -1031,12 +1081,22 @@  opal_entry:
 	bctrl
 
 	mr	%r4,%r1
-	bl	opal_exit_check
+	bl	opal_exit_check /* r3 is preserved */
 
-1:	ld	%r12,STACK_LR(%r1)
+	/*
+	 * Restore r1 and r13 before decrementing in_opal_call.
+	 * Move per-cpu pointer to volatile r12, restore lr, r1, r13.
+	 */
+.Lreturn:
+	ld	%r12,STACK_LR(%r1)
 	mtlr	%r12
+	mr	%r12,%r13
 	ld	%r13,STACK_GPR13(%r1)
 	ld	%r1,STACK_GPR1(%r1)
+	sync 	/* release barrier vs quiescing */
+	lwz	%r11,CPUTHREAD_IN_OPAL_CALL(%r12)
+	subi	%r11,%r11,1
+	stw	%r11,CPUTHREAD_IN_OPAL_CALL(%r12)
 	blr
 
 .global start_kernel
diff --git a/core/opal.c b/core/opal.c
index e57f0a18..f6922b26 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -142,7 +142,7 @@  int64_t opal_entry_check(struct stack_frame *eframe)
 	if (!opal_check_token(token))
 		return opal_bad_token(token);
 
-	if (!opal_quiesce_state && cpu->in_opal_call) {
+	if (!opal_quiesce_state && cpu->in_opal_call > 1) {
 		disable_fast_reboot("Kernel re-entered OPAL");
 		switch (token) {
 		case OPAL_CONSOLE_READ:
@@ -158,30 +158,14 @@  int64_t opal_entry_check(struct stack_frame *eframe)
 		default:
 			printf("CPU ATTEMPT TO RE-ENTER FIRMWARE! PIR=%04lx cpu @%p -> pir=%04x token=%llu\n",
 			       mfspr(SPR_PIR), cpu, cpu->pir, token);
+			if (cpu->in_opal_call > 2) {
+				printf("Emergency stack is destroyed, can't continue.\n");
+				abort();
+			}
 			return OPAL_INTERNAL_ERROR;
 		}
 	}
 
-again:
-	cpu->in_opal_call++;
-	/*
-	 * Order the store in_opal_call vs load quiesce_opal_call.
-	 * This also provides an acquire barrier for opal entry vs
-	 * another thread quiescing opal. In this way, quiescing
-	 * can behave as mutual exclusion.
-	 */
-	sync();
-	if (cpu->quiesce_opal_call) {
-		cpu->in_opal_call--;
-		if (opal_quiesce_state == QUIESCE_REJECT)
-			return OPAL_BUSY;
-		smt_lowest();
-		while (cpu->quiesce_opal_call)
-			barrier();
-		smt_medium();
-		goto again;
-	}
-
 	return OPAL_SUCCESS;
 }
 
@@ -196,14 +180,17 @@  int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe)
 		disable_fast_reboot("Un-accounted firmware entry");
 		printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x token=%llu retval=%lld\n",
 		       mfspr(SPR_PIR), cpu, cpu->pir, token, retval);
+		cpu->in_opal_call++; /* avoid exit path underflowing */
 	} else {
+		if (cpu->in_opal_call > 2) {
+			printf("Emergency stack is destroyed, can't continue.\n");
+			abort();
+		}
 		if (!list_empty(&cpu->locks_held)) {
 			prlog(PR_ERR, "OPAL exiting with locks held, token=%llu retval=%lld\n",
 			      token, retval);
 			drop_my_locks(true);
 		}
-		sync(); /* release barrier vs quiescing */
-		cpu->in_opal_call--;
 	}
 	return retval;
 }
@@ -253,7 +240,7 @@  int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
 		bust_locks = false;
 		sync(); /* release barrier vs opal entry */
 		if (target) {
-			target->quiesce_opal_call = false;
+			target->quiesce_opal_call = 0;
 		} else {
 			for_each_cpu(c) {
 				if (quiesce_type == QUIESCE_RESUME_FAST_REBOOT)
@@ -263,7 +250,7 @@  int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
 					assert(!c->quiesce_opal_call);
 					continue;
 				}
-				c->quiesce_opal_call = false;
+				c->quiesce_opal_call = 0;
 			}
 		}
 		sync();
@@ -281,12 +268,12 @@  int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
 	}
 
 	if (target) {
-		target->quiesce_opal_call = true;
+		target->quiesce_opal_call = quiesce_type;
 	} else {
 		for_each_cpu(c) {
 			if (c == cpu)
 				continue;
-			c->quiesce_opal_call = true;
+			c->quiesce_opal_call = quiesce_type;
 		}
 	}
 
diff --git a/include/cpu.h b/include/cpu.h
index b7cd588d..92ba83e8 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -61,10 +61,10 @@  struct cpu_thread {
 	uint64_t			save_r1;
 	void				*icp_regs;
 	uint32_t			in_opal_call;
+	uint32_t			quiesce_opal_call;
 	uint32_t			con_suspend;
 	struct list_head		locks_held;
 	bool				con_need_flush;
-	bool				quiesce_opal_call;
 	bool				in_mcount;
 	bool				in_poller;
 	bool				in_reinit;