diff mbox series

[v3,1/8] Add basic P9 fused core support

Message ID 20190324172543.12625-2-svaidy@linux.vnet.ibm.com
State Changes Requested
Headers show
Series Initial fused-core support for POWER9 | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch master (b392d785eb49630b9f00fef8d17944ed82b2c1fe)
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot success Test snowpatch/job/snowpatch-skiboot on branch master
snowpatch_ozlabs/snowpatch_job_snowpatch-skiboot-dco success Signed-off-by present

Commit Message

Vaidyanathan Srinivasan March 24, 2019, 5:25 p.m. UTC
From: Ryan Grimm <grimm@linux.vnet.ibm.com>

P9 cores can be configured into fused core mode where two core chiplets
function as an 8-threaded, single core.  So, when in fused core mode, bump
the thread count from four to eight both in boot_entry and in
cpu_thread_count (set in init_boot_cpu).

The HID, AMOR, TSCR, RPR require the first active thread on that core chiplet
to load the copy for that core chiplet.  So, send thread 1 of a fused core to
init_shared_sprs in boot_entry.

The code checks for fused core mode in the core thread state register and puts a
field in struct cpu_thread.  This flag is checked when updating the HID and in
XIVE code when setting the special bar.

For XSCOM, the core ID is the non-fused EX.  So, create macros to arrange the
bits.  It's fairly verbose but somewhat readable.

This was tested on a P9 ZZ with 16 fused cores and ran HTX for over 24 hours.

Signed-off-by: Ryan Grimm <grimm@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Neuling <mikey@neuling.org>
---
 asm/head.S               | 24 +++++++++++++++++++++---
 core/chip.c              | 15 +++++++++++----
 core/cpu.c               | 39 ++++++++++++++++++++++++++++++++++-----
 core/fast-reboot.c       |  2 +-
 hdata/test/hdata_to_dt.c |  9 ++++++++-
 hw/xive.c                |  2 +-
 include/chip.h           | 31 +++++++++++++++++++++++++++++++
 include/cpu.h            |  6 ++++++
 include/xscom.h          |  3 +++
 9 files changed, 116 insertions(+), 15 deletions(-)

Comments

Stewart Smith May 20, 2019, 7 a.m. UTC | #1
Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> writes:
> From: Ryan Grimm <grimm@linux.vnet.ibm.com>
>
> P9 cores can be configured into fused core mode where two core chiplets
> function as an 8-threaded, single core.  So, bump four to eight in boot_entry
> when in fused core mode and cpu_thread_count in init_boot_cpu.
>
> The HID, AMOR, TSCR, RPR require the first active thread on that core chiplet
> to load the copy for that core chiplet.  So, send thread 1 of a fused core to
> init_shared_sprs in boot_entry.
>
> The code checks for fused core mode in the core thread state register and puts a
> field in struct cpu_thread.  This flag is checked when updating the HID and in
> XIVE code when setting the special bar.
>
> For XSCOM, the core ID is the non-fused EX.  So, create macros to arrange the
> bits.  It's fairly verbose but somewhat readable.
>
> This was tested on a P9 ZZ with 16 fused cores and ran HTX for over 24 hours.
>
> Signed-off-by: Ryan Grimm <grimm@linux.vnet.ibm.com>
> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Signed-off-by: Michael Neuling <mikey@neuling.org>
> ---
>  asm/head.S               | 24 +++++++++++++++++++++---
>  core/chip.c              | 15 +++++++++++----
>  core/cpu.c               | 39 ++++++++++++++++++++++++++++++++++-----
>  core/fast-reboot.c       |  2 +-
>  hdata/test/hdata_to_dt.c |  9 ++++++++-
>  hw/xive.c                |  2 +-
>  include/chip.h           | 31 +++++++++++++++++++++++++++++++
>  include/cpu.h            |  6 ++++++
>  include/xscom.h          |  3 +++
>  9 files changed, 116 insertions(+), 15 deletions(-)

So, I'm finding a bit of a strange issue when I apply this patch.

Firstly, it seems that we haven't really been enabling SMT4 mode with
Mambo, and have instead been running without the "normal core mode" bit
set in SIM_CTRL1. With the following change:

--- a/external/mambo/skiboot.tcl
+++ b/external/mambo/skiboot.tcl
@@ -110,7 +110,7 @@ if { $default_config == "PEGASUS" } {
 if { $default_config == "P9" } {
     # PVR configured for POWER9 DD2.0 Scale out 24 Core (ie SMT4)
     myconf config processor/initial/PVR 0x4e1200
-    myconf config processor/initial/SIM_CTRL1 0xc228100400000000
+    myconf config processor/initial/SIM_CTRL1 0xc268100400000000
 
     if { $mconf(numa) } {
         myconf config memory_region_id_shift 45

things work again, and we no longer assume in all our Mambo tests that
we're in big core mode.

However, with that bit flipped, in the SMT tests, threads!=0 think the
PVR is 0x000, which works about as well as you'd expect.

I'm asking internally if there's anything I'm missing in the Mambo
config and what could be wrong. Any chance you have some insight?

I can't really merge this until 'make check' passes again.
diff mbox series

Patch

diff --git a/asm/head.S b/asm/head.S
index 0ed1acdd..67d493ed 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -336,6 +336,7 @@  boot_offset:
  *   r28 :  PVR
  *   r27 :  DTB pointer (or NULL)
  *   r26 :  PIR thread mask
+ *   r25 :  P9 fused core flag
  */
 .global boot_entry
 boot_entry:
@@ -354,13 +355,21 @@  boot_entry:
 	cmpwi	cr0,%r3,PVR_TYPE_P8NVL
 	beq	2f
 	cmpwi	cr0,%r3,PVR_TYPE_P9
-	beq 	1f
+	beq 	3f
 	cmpwi	cr0,%r3,PVR_TYPE_P9P
-	beq 	1f
+	beq 	3f
 	attn		/* Unsupported CPU type... what do we do ? */
 	b 	.	/* loop here, just in case attn is disabled */
 
-	/* P8 -> 8 threads */
+	/* Check for fused core and set flag */
+3:
+	li	%r3, 0x1e0
+	mtspr   SPR_SPRC, %r3
+	mfspr	%r3, SPR_SPRD
+	andi.	%r25, %r3, 1
+	beq 1f
+
+	/* P8 or P9 fused -> 8 threads */
 2:	li	%r26,7
 
 	/* Get our reloc offset into r30 */
@@ -382,6 +391,15 @@  boot_entry:
 	LOAD_IMM64(%r3, (MSR_HV | MSR_SF))
 	mtmsrd	%r3,0
 
+	/* If fused, t1 is primary chiplet and must init shared sprs */
+	andi.	%r3,%r25,1
+	beq	not_fused
+
+	mfspr	%r31,SPR_PIR
+	andi.	%r3,%r31,1
+	bnel	init_shared_sprs
+
+not_fused:
 	/* Check our PIR, avoid threads */
 	mfspr	%r31,SPR_PIR
 	and.	%r0,%r31,%r26
diff --git a/core/chip.c b/core/chip.c
index 65263253..2b9b6ef9 100644
--- a/core/chip.c
+++ b/core/chip.c
@@ -20,6 +20,7 @@ 
 #include <console.h>
 #include <device.h>
 #include <timebase.h>
+#include <cpu.h>
 
 static struct proc_chip *chips[MAX_CHIPS];
 enum proc_chip_quirks proc_chip_quirks;
@@ -37,7 +38,10 @@  uint32_t pir_to_chip_id(uint32_t pir)
 uint32_t pir_to_core_id(uint32_t pir)
 {
 	if (proc_gen == proc_gen_p9)
-		return P9_PIR2COREID(pir);
+		if (this_cpu()->is_fused_core)
+			return P9_PIRFUSED2NORMALCOREID(pir);
+		else
+			return P9_PIR2COREID(pir);
 	else if (proc_gen == proc_gen_p8)
 		return P8_PIR2COREID(pir);
 	else
@@ -46,9 +50,12 @@  uint32_t pir_to_core_id(uint32_t pir)
 
 uint32_t pir_to_thread_id(uint32_t pir)
 {
-	if (proc_gen == proc_gen_p9)
-		return P9_PIR2THREADID(pir);
-	else if (proc_gen == proc_gen_p8)
+	if (proc_gen == proc_gen_p9) {
+		if (this_cpu()->is_fused_core)
+			return P9_PIR2FUSEDTHREADID(pir);
+		else
+			return P9_PIR2THREADID(pir);
+	} else if (proc_gen == proc_gen_p8)
 		return P8_PIR2THREADID(pir);
 	else
 		return P7_PIR2THREADID(pir);
diff --git a/core/cpu.c b/core/cpu.c
index d9d47133..1bcd2b66 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -922,6 +922,14 @@  void cpu_disable_all_threads(struct cpu_thread *cpu)
 	/* XXX Do something to actually stop the core */
 }
 
+static int is_fused_core (void)
+{
+	unsigned int core_thread_state;
+	mtspr(SPR_SPRC, 0x00000000000001e0ULL);
+	core_thread_state = mfspr(SPR_SPRD);
+	return core_thread_state & PPC_BIT(63);
+}
+
 static void init_cpu_thread(struct cpu_thread *t,
 			    enum cpu_thread_state state,
 			    unsigned int pir)
@@ -941,6 +949,7 @@  static void init_cpu_thread(struct cpu_thread *t,
 #ifdef STACK_CHECK_ENABLED
 	t->stack_bot_mark = LONG_MAX;
 #endif
+	t->is_fused_core = is_fused_core();
 	assert(pir == container_of(t, struct cpu_stack, cpu) - cpu_stacks);
 }
 
@@ -1034,14 +1043,16 @@  void init_boot_cpu(void)
 		      " (max %d threads/core)\n", cpu_thread_count);
 		break;
 	case proc_gen_p9:
-		cpu_thread_count = 4;
+		if (is_fused_core())
+			cpu_thread_count = 8;
+		else
+			cpu_thread_count = 4;
 		prlog(PR_INFO, "CPU: P9 generation processor"
 		      " (max %d threads/core)\n", cpu_thread_count);
 		break;
 	default:
 		prerror("CPU: Unknown PVR, assuming 1 thread\n");
 		cpu_thread_count = 1;
-		cpu_max_pir = mfspr(SPR_PIR);
 	}
 
 	if (is_power9n(pvr) && (PVR_VERS_MAJ(pvr) == 1)) {
@@ -1169,7 +1180,7 @@  void init_all_cpus(void)
 
 	/* Iterate all CPUs in the device-tree */
 	dt_for_each_child(cpus, cpu) {
-		unsigned int pir, server_no, chip_id;
+		unsigned int pir, server_no, chip_id, threads;
 		enum cpu_thread_state state;
 		const struct dt_property *p;
 		struct cpu_thread *t, *pt;
@@ -1197,6 +1208,14 @@  void init_all_cpus(void)
 		prlog(PR_INFO, "CPU: CPU from DT PIR=0x%04x Server#=0x%x"
 		      " State=%d\n", pir, server_no, state);
 
+		/* Check max PIR */
+		if (cpu_max_pir < (pir + cpu_thread_count - 1)) {
+			prlog(PR_WARNING, "CPU: CPU potentially out of range"
+			      "PIR=0x%04x MAX=0x%04x !\n",
+			      pir, cpu_max_pir);
+			continue;
+		}
+
 		/* Setup thread 0 */
 		assert(pir <= cpu_max_pir);
 		t = pt = &cpu_stacks[pir].cpu;
@@ -1222,11 +1241,21 @@  void init_all_cpus(void)
 		/* Add the decrementer width property */
 		dt_add_property_cells(cpu, "ibm,dec-bits", dec_bits);
 
+		if (t->is_fused_core)
+			dt_add_property(t->node, "ibm,fused-core", NULL, 0);
+
 		/* Iterate threads */
 		p = dt_find_property(cpu, "ibm,ppc-interrupt-server#s");
 		if (!p)
 			continue;
-		for (thread = 1; thread < (p->len / 4); thread++) {
+		threads = p->len / 4;
+		if (threads > cpu_thread_count) {
+			prlog(PR_WARNING, "CPU: Threads out of range for PIR 0x%04x"
+			      " threads=%d max=%d\n",
+			      pir, threads, cpu_thread_count);
+			threads = cpu_thread_count;
+		}
+		for (thread = 1; thread < threads; thread++) {
 			prlog(PR_TRACE, "CPU:   secondary thread %d found\n",
 			      thread);
 			t = &cpu_stacks[pir + thread].cpu;
@@ -1412,7 +1441,7 @@  static int64_t cpu_change_all_hid0(struct hid0_change_req *req)
 	assert(jobs);
 
 	for_each_available_cpu(cpu) {
-		if (!cpu_is_thread0(cpu))
+		if (!cpu_is_thread0(cpu) && !cpu_is_core_chiplet_primary(cpu))
 			continue;
 		if (cpu == this_cpu())
 			continue;
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 22160b65..be70c227 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -236,7 +236,7 @@  static void cleanup_cpu_state(void)
 	struct cpu_thread *cpu = this_cpu();
 
 	/* Per core cleanup */
-	if (cpu_is_thread0(cpu)) {
+	if (cpu_is_thread0(cpu) | cpu_is_core_chiplet_primary(cpu)) {
 		/* Shared SPRs whacked back to normal */
 
 		/* XXX Update the SLW copies ! Also dbl check HIDs etc... */
diff --git a/hdata/test/hdata_to_dt.c b/hdata/test/hdata_to_dt.c
index a5f152e8..cddb1d43 100644
--- a/hdata/test/hdata_to_dt.c
+++ b/hdata/test/hdata_to_dt.c
@@ -47,7 +47,11 @@  struct spira_ntuple;
 static void *ntuple_addr(const struct spira_ntuple *n);
 
 /* Stuff which core expects. */
-#define __this_cpu ((struct cpu_thread *)NULL)
+struct cpu_thread *my_fake_cpu;
+static struct cpu_thread *this_cpu(void)
+{
+	return my_fake_cpu;
+}
 
 unsigned long tb_hz = 512000000;
 
@@ -84,6 +88,7 @@  unsigned long tb_hz = 512000000;
 struct cpu_thread {
 	uint32_t			pir;
 	uint32_t			chip_id;
+	bool				is_fused_core;
 };
 struct cpu_job *__cpu_queue_job(struct cpu_thread *cpu,
 				const char *name,
@@ -105,6 +110,8 @@  static inline struct cpu_job *cpu_queue_job(struct cpu_thread *cpu,
 struct cpu_thread __boot_cpu, *boot_cpu = &__boot_cpu;
 static unsigned long fake_pvr = PVR_P7;
 
+unsigned int cpu_thread_count = 8;
+
 static inline unsigned long mfspr(unsigned int spr)
 {
 	assert(spr == SPR_PVR);
diff --git a/hw/xive.c b/hw/xive.c
index b863b634..c9f3f07d 100644
--- a/hw/xive.c
+++ b/hw/xive.c
@@ -3299,7 +3299,7 @@  static void xive_init_cpu(struct cpu_thread *c)
 	 * of a pair is present we just do the setup for each of them, which
 	 * is harmless.
 	 */
-	if (cpu_is_thread0(c))
+	if (cpu_is_thread0(c) || cpu_is_core_chiplet_primary(c))
 		xive_configure_ex_special_bar(x, c);
 
 	/* Initialize the state structure */
diff --git a/include/chip.h b/include/chip.h
index d6e7e355..a73a52d7 100644
--- a/include/chip.h
+++ b/include/chip.h
@@ -91,6 +91,26 @@ 
  * thus we have a 6-bit core number.
  *
  * Note: XIVE Only supports 4-bit chip numbers ...
+ *
+ * Upper PIR Bits
+ * --------------
+ *
+ * Normal-Core Mode:
+ * 57:61 CoreID
+ * 62:63 ThreadID
+ *
+ * Fused-Core Mode:
+ * 57:59 FusedQuadID
+ * 60    FusedCoreID
+ * 61:63 FusedThreadID
+ *
+ * FusedCoreID 0 contains normal-core chiplet 0 and 1
+ * FusedCoreID 1 contains normal-core chiplet 2 and 3
+ *
+ * Fused cores have interleaved threads:
+ * core chiplet 0/2 = t0, t2, t4, t6
+ * core chiplet 1/3 = t1, t3, t5, t7
+ *
  */
 #define P9_PIR2GCID(pir) (((pir) >> 8) & 0x7f)
 
@@ -102,6 +122,17 @@ 
 
 #define P9_GCID2CHIPID(gcid) ((gcid) & 0x7)
 
+#define P9_PIR2FUSEDQUADID(pir) (((pir) >> 4) & 0x7)
+
+#define P9_PIR2FUSEDCOREID(pir) (((pir) >> 3) & 0x1)
+
+#define P9_PIR2FUSEDTHREADID(pir) ((pir) & 0x7)
+
+#define P9_PIRFUSED2NORMALCOREID(pir) \
+	(P9_PIR2FUSEDQUADID(pir) << 2) | \
+	(P9_PIR2FUSEDCOREID(pir) << 1) | \
+	(P9_PIR2FUSEDTHREADID(pir) & 1)
+
 /* P9 specific ones mostly used by XIVE */
 #define P9_PIR2LOCALCPU(pir) ((pir) & 0xff)
 #define P9_PIRFROMLOCALCPU(chip, cpu)	(((chip) << 8) | (cpu))
diff --git a/include/cpu.h b/include/cpu.h
index 06d5c0d1..009ae52c 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -54,6 +54,7 @@  struct cpu_thread {
 	uint32_t			server_no;
 	uint32_t			chip_id;
 	bool				is_secondary;
+	bool				is_fused_core;
 	struct cpu_thread		*primary;
 	enum cpu_thread_state		state;
 	struct dt_node			*node;
@@ -251,6 +252,11 @@  static inline bool cpu_is_thread0(struct cpu_thread *cpu)
 	return cpu->primary == cpu;
 }
 
+static inline bool cpu_is_core_chiplet_primary(struct cpu_thread *cpu)
+{
+	return cpu->is_fused_core & (cpu_get_thread_index(cpu) == 1);
+}
+
 static inline bool cpu_is_sibling(struct cpu_thread *cpu1,
 				  struct cpu_thread *cpu2)
 {
diff --git a/include/xscom.h b/include/xscom.h
index 98532240..0885adf3 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -123,6 +123,9 @@ 
 
 /*
  * Additional useful definitions for P9
+ *
+ * Note: In all of these, the core numbering is the
+ * *normal* (small) core number.
  */
 
 /* An EQ is a quad (also named an EP) */