
[5/5] powerpc/smp: Add Power9 scheduler topology

Message ID: 20170302004920.21948-5-oohall@gmail.com (mailing list archive)
State: Changes Requested

Commit Message

Oliver O'Halloran March 2, 2017, 12:49 a.m. UTC
In previous generations of Power processors each core had a private L2
cache. The Power9 processor has a slightly different architecture where
the L2 cache is shared among pairs of cores rather than being completely
private.

Making the scheduler aware of this cache sharing allows the scheduler to
make more intelligent migration decisions. When one core in the pair is
overloaded, tasks can be migrated to its paired core to improve throughput
without the cache-refilling penalty typically associated with task
migration.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/kernel/smp.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)
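
For readers who want to see the pairing on a running system, the sketch below
is a userspace check (not part of this series) that walks the cacheinfo sysfs
directories and prints which CPUs report a shared L2. The attribute names
come from the usual cacheinfo layout and the CPU/index counts are arbitrary
assumptions, so treat it as illustrative only.

#include <stdio.h>

int main(void)
{
	char path[128], line[256];
	int cpu, idx, level;
	FILE *f;

	for (cpu = 0; cpu < 8; cpu++) {		/* assume 8 CPUs for the example */
		for (idx = 0; idx < 4; idx++) {	/* walk the cache index dirs */
			snprintf(path, sizeof(path),
				 "/sys/devices/system/cpu/cpu%d/cache/index%d/level",
				 cpu, idx);
			f = fopen(path, "r");
			if (!f)
				continue;
			if (fscanf(f, "%d", &level) != 1)
				level = -1;
			fclose(f);
			if (level != 2)		/* only interested in the L2 */
				continue;

			snprintf(path, sizeof(path),
				 "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map",
				 cpu, idx);
			f = fopen(path, "r");
			if (f && fgets(line, sizeof(line), f))
				printf("cpu%d L2 shared_cpu_map: %s", cpu, line);
			if (f)
				fclose(f);
		}
	}
	return 0;
}

With this series applied on a P9, each shared_cpu_map should cover the
threads of both cores in a pair rather than a single core.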

Comments

Balbir Singh March 2, 2017, 10:25 a.m. UTC | #1
On 02/03/17 11:49, Oliver O'Halloran wrote:
> In previous generations of Power processors each core had a private L2
> cache. The Power9 processor has a slightly different architecture where
> the L2 cache is shared among pairs of cores rather than being completely
> private.
> 
> Making the scheduler aware of this cache sharing allows the scheduler to
> make more intelligent migration decisions. When one core in the pair is
> overloaded, tasks can be migrated to its paired core to improve throughput
> without the cache-refilling penalty typically associated with task
> migration.

Could you please describe the changes to sched_domains w.r.t. before and
after for P9?

I think most of the changes make sense, but some data to back them up,
including any results you have showing that these patches help, would be
nice.

Balbir Singh.
Michael Ellerman March 15, 2017, 11:30 a.m. UTC | #2
Oliver O'Halloran <oohall@gmail.com> writes:

> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 5571f30ff72d..5e1811b24415 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -724,10 +724,17 @@ static void update_cpu_masks(int cpu, bool onlining)
>  
>  	update_thread_mask(cpu, onlining);
>  
> +	/* we need the l2 cache mask for the power9 scheduler topology */
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		update_mask_by_l2(cpu, onlining, cpu_cache_mask);

I guess I missed something somewhere, but why do we need to check the CPU
feature?

...
> @@ -829,7 +862,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
>  
>  	dump_numa_cpu_topology();
>  
> -	set_sched_topology(powerpc_topology);
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		set_sched_topology(power9_topology);
> +	else
> +		set_sched_topology(powerpc_topology);
  
Ideally this would all just come from the device tree. If we detect that
the L2 is shared then we tell the scheduler that; we shouldn't need to
know that P9 is special. It certainly doesn't say anywhere in ISA 3.00
that the L2 is shared.

cheers
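
A rough sketch of the direction being suggested here, assuming the existing
cpu_to_l2cache() helper from this file and the cpu_sibling_mask() thread
mask: detect at boot whether any two CPUs in different cores report the same
l2-cache node, and only then select the cache-aware topology, instead of
keying off CPU_FTR_ARCH_300. The function name and structure are
illustrative, not taken from the series.

/*
 * Illustrative only: return true if some pair of online CPUs in different
 * cores resolve to the same l2-cache device tree node.
 */
static bool shared_l2_between_cores(void)
{
	struct device_node *l2_a, *l2_b;
	int cpu, other;
	bool shared = false;

	for_each_online_cpu(cpu) {
		l2_a = cpu_to_l2cache(cpu);
		if (!l2_a)
			continue;

		for_each_online_cpu(other) {
			/* skip threads of the same core */
			if (cpumask_test_cpu(other, cpu_sibling_mask(cpu)))
				continue;

			l2_b = cpu_to_l2cache(other);
			if (l2_b == l2_a)
				shared = true;
			of_node_put(l2_b);
			if (shared)
				break;
		}

		of_node_put(l2_a);
		if (shared)
			break;
	}

	return shared;
}

smp_cpus_done() could then choose the cache-aware topology based on this
check rather than on the CPU feature bit.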
Michael Ellerman March 15, 2017, 11:33 a.m. UTC | #3
Balbir Singh <bsingharora@gmail.com> writes:

> On 02/03/17 11:49, Oliver O'Halloran wrote:
>> In previous generations of Power processors each core had a private L2
>> cache. The Power9 processor has a slightly different architecture where
>> the L2 cache is shared among pairs of cores rather than being completely
>> private.
>> 
>> Making the scheduler aware of this cache sharing allows the scheduler to
>> make more intelligent migration decisions. When one core in the pair is
>> overloaded, tasks can be migrated to its paired core to improve throughput
>> without the cache-refilling penalty typically associated with task
>> migration.
>
> Could you please describe the changes to sched_domains w.r.t. before and
> after for P9?

Yes please.

cheers

Patch

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5571f30ff72d..5e1811b24415 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -673,7 +673,7 @@  static struct device_node *cpu_to_l2cache(int cpu)
 	return cache;
 }
 
-static bool update_core_mask_by_l2(int cpu, bool onlining)
+static bool update_mask_by_l2(int cpu, bool onlining, struct cpumask *(*mask_fn)(int))
 {
 	const struct cpumask *mask = onlining ? cpu_online_mask : cpu_present_mask;
 	struct device_node *l2_cache, *np;
@@ -689,7 +689,7 @@  static bool update_core_mask_by_l2(int cpu, bool onlining)
 			continue;
 
 		if (np == l2_cache)
-			set_cpus_related(cpu, i, onlining, cpu_core_mask);
+			set_cpus_related(cpu, i, onlining, mask_fn);
 
 		of_node_put(np);
 	}
@@ -724,10 +724,17 @@  static void update_cpu_masks(int cpu, bool onlining)
 
 	update_thread_mask(cpu, onlining);
 
+	/* we need the l2 cache mask for the power9 scheduler topology */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		update_mask_by_l2(cpu, onlining, cpu_cache_mask);
+
+	/* now build the core mask */
+	set_cpus_related(cpu, cpu, onlining, cpu_core_mask);
+
 	if (update_core_mask_by_chip_id(cpu, onlining))
 		return;
 
-	if (update_core_mask_by_l2(cpu, onlining))
+	if (update_mask_by_l2(cpu, onlining, cpu_core_mask))
 		return;
 
 	/* if all else fails duplicate the sibling mask */
@@ -805,6 +812,32 @@  static struct sched_domain_topology_level powerpc_topology[] = {
 	{ NULL, },
 };
 
+
+/* P9 has a slightly odd architecture where two four-thread cores share an L2
+ * cache. For highly threaded workloads it makes sense to try and keep tasks
+ * inside the pair for better cache utilisation so the scheduler needs to be
+ * aware of this. */
+static int powerpc_shared_cache_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING;
+}
+
+/* this is kind of gross, but passing cpu_cache_mask directly
+ * causes the build to fail due to incompatible pointer types */
+static inline const struct cpumask *cpu_cache_mask_c(int cpu)
+{
+	return cpu_cache_mask(cpu);
+}
+
+static struct sched_domain_topology_level power9_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ cpu_cache_mask_c, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	cpumask_var_t old_mask;
@@ -829,7 +862,10 @@  void __init smp_cpus_done(unsigned int max_cpus)
 
 	dump_numa_cpu_topology();
 
-	set_sched_topology(powerpc_topology);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		set_sched_topology(power9_topology);
+	else
+		set_sched_topology(powerpc_topology);
 
 }