diff mbox series

[11/11] powerpc/smp: Add a doorbell=off kernel parameter

Message ID 20211105102636.1016378-12-clg@kaod.org (mailing list archive)
State Deferred
Headers show
Series powerpc/xive: Improve diagnostic and activate StoreEOI on P10 PowerNV | expand

Commit Message

Cédric Le Goater Nov. 5, 2021, 10:26 a.m. UTC
On processors with a XIVE interrupt controller (POWER9 and above), the
kernel can use either doorbells or XIVE to generate CPU IPIs. Sending
doorbell is generally preferred to using the XIVE IC because it is
faster. There are cases where we want to avoid doorbells and use XIVE
only, for debug or performance. Only useful on POWER9 and above.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
 arch/powerpc/include/asm/dbell.h                |  1 +
 arch/powerpc/kernel/dbell.c                     | 17 +++++++++++++++++
 arch/powerpc/platforms/powernv/smp.c            |  7 +++++--
 arch/powerpc/platforms/pseries/smp.c            |  3 +++
 Documentation/admin-guide/kernel-parameters.txt | 10 ++++++++++
 5 files changed, 36 insertions(+), 2 deletions(-)

Comments

Michael Ellerman Nov. 11, 2021, 10:41 a.m. UTC | #1
Cédric Le Goater <clg@kaod.org> writes:
> On processors with a XIVE interrupt controller (POWER9 and above), the
> kernel can use either doorbells or XIVE to generate CPU IPIs. Sending
> doorbell is generally preferred to using the XIVE IC because it is
> faster. There are cases where we want to avoid doorbells and use XIVE
> only, for debug or performance. Only useful on POWER9 and above.

How much do we want this?

Kernel command line args are a bit of a pain, they tend to be poorly
tested, because someone has to explicitly enable them at boot time, and
then reboot to test the other case.

When would we want to enable this? Can we make the kernel smarter about
when to use doorbells and make it automated?

Could we make it a runtime switch?

cheers

>
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
>  arch/powerpc/include/asm/dbell.h                |  1 +
>  arch/powerpc/kernel/dbell.c                     | 17 +++++++++++++++++
>  arch/powerpc/platforms/powernv/smp.c            |  7 +++++--
>  arch/powerpc/platforms/pseries/smp.c            |  3 +++
>  Documentation/admin-guide/kernel-parameters.txt | 10 ++++++++++
>  5 files changed, 36 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
> index 3e9da22a2779..07775aa3e81b 100644
> --- a/arch/powerpc/include/asm/dbell.h
> +++ b/arch/powerpc/include/asm/dbell.h
> @@ -90,6 +90,7 @@ static inline void ppc_msgsync(void)
>  #endif /* CONFIG_PPC_BOOK3S */
>  
>  extern void doorbell_exception(struct pt_regs *regs);
> +extern bool doorbell_disabled;
>  
>  static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
>  {
> diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
> index 5545c9cd17c1..681ee4775629 100644
> --- a/arch/powerpc/kernel/dbell.c
> +++ b/arch/powerpc/kernel/dbell.c
> @@ -38,6 +38,23 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception)
>  
>  	set_irq_regs(old_regs);
>  }
> +
> +bool doorbell_disabled;
> +
> +static int __init doorbell_cmdline(char *arg)
> +{
> +	if (!arg)
> +		return -EINVAL;
> +
> +	if (strncmp(arg, "off", 3) == 0) {
> +		pr_info("Doorbell disabled on kernel command line\n");
> +		doorbell_disabled = true;
> +	}
> +
> +	return 0;
> +}
> +__setup("doorbell=", doorbell_cmdline);
> +
>  #else /* CONFIG_SMP */
>  DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception)
>  {
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> index cbb67813cd5d..1311bda9446a 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -338,10 +338,13 @@ static void __init pnv_smp_probe(void)
>  		ic_cause_ipi = smp_ops->cause_ipi;
>  		WARN_ON(!ic_cause_ipi);
>  
> -		if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +			if (doorbell_disabled)
> +				return;
>  			smp_ops->cause_ipi = doorbell_global_ipi;
> -		else
> +		} else {
>  			smp_ops->cause_ipi = pnv_cause_ipi;
> +		}
>  	}
>  }
>  
> diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
> index f47429323eee..3bc9e6aaf645 100644
> --- a/arch/powerpc/platforms/pseries/smp.c
> +++ b/arch/powerpc/platforms/pseries/smp.c
> @@ -229,6 +229,9 @@ static __init void pSeries_smp_probe(void)
>  			return;
>  	}
>  
> +	if (doorbell_disabled)
> +		return;
> +
>  	/*
>  	 * Under PowerVM, FSCR[MSGP] is enabled as guest vCPU siblings are
>  	 * gang scheduled on the same physical core, so doorbells are always
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 10fa093251e8..2e1284febe39 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1041,6 +1041,16 @@
>  			The filter can be disabled or changed to another
>  			driver later using sysfs.
>  
> +	doorbell=off	[PPC]
> +			On processors with a XIVE interrupt controller
> +			(POWER9 and above), the kernel can use either
> +			doorbells or XIVE to generate CPU IPIs.	Sending
> +			doorbell is generally preferred to using the XIVE
> +			IC because it is faster. There are cases where
> +			we want to avoid doorbells and use XIVE only,
> +			for debug or performance. Only useful on
> +			POWER9 and above.
> +
>  	driver_async_probe=  [KNL]
>  			List of driver names to be probed asynchronously.
>  			Format: <driver_name1>,<driver_name2>...
> -- 
> 2.31.1
Cédric Le Goater Nov. 11, 2021, 4:01 p.m. UTC | #2
On 11/11/21 11:41, Michael Ellerman wrote:
> Cédric Le Goater <clg@kaod.org> writes:
>> On processors with a XIVE interrupt controller (POWER9 and above), the
>> kernel can use either doorbells or XIVE to generate CPU IPIs. Sending
>> doorbell is generally preferred to using the XIVE IC because it is
>> faster. There are cases where we want to avoid doorbells and use XIVE
>> only, for debug or performance. Only useful on POWER9 and above.
> 
> How much do we want this?

Yes. Thanks for asking. It is a recent need.

Here is some background I should have added in the first place. May be
for a v2.

We have different ways of doing IPIs on POWER9 and above processors,
depending on the platform and the underlying hypervisor.

- PowerNV uses global doorbells

- pSeries/KVM uses XIVE only because local doorbells are not
   efficient, as there are emulated in the KVM hypervisor

- pSeries/PowerVM uses XIVE for remote cores and local doorbells for
   threads on same core (SMT4 or 8)

This recent commit 5b06d1679f2f ("powerpc/pseries: Use doorbells even
if XIVE is available") introduced the optimization for PowerVM and
commit 107c55005fbd ("powerpc/pseries: Add KVM guest doorbell
restrictions") restricted the optimization.

We would like a way to turn off the optimization.

> Kernel command line args are a bit of a pain, they tend to be poorly
> tested, because someone has to explicitly enable them at boot time,
> and then reboot to test the other case.

True. The "xive=off" parameter was poorly tested initially.

> When would we want to enable this?

For bring-up, for debug, for tests. I have been using a similar switch
to compare the XIVE interrupt controller performance with doorbells on
POWER9 and P0WER10.

A new need arises with PowerVM, some configurations will behave as KVM
(local doorbell are unsupported) and the doorbell=off parameter is a
simple way to handle this case today.

> Can we make the kernel smarter about when to use doorbells and make
> it automated?

I don't think we want to probe all IPI methods to detect how well
local doorbells are supported on the platform. Do we ?

A machine property/feature would be cleaner. It is a global CPU
property but I don't know where to put it. Ideas ?

> Could we make it a runtime switch?

We can. See the patch below. It covers the need for test/performance
but it won't work on a PowerVM system not supporting local doorbells
since boot will fail as soon as secondaries are started. We need a way
to take a decision early on which method to activate.


Thanks

C.

 From dcac8528c89b689217515032f3329ba5ea10085d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@kaod.org>
Date: Fri, 5 Nov 2021 12:23:48 +0100
Subject: [PATCH] powerpc/xive: Add a debugfs toggle to select xive for IPIs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For performance tests only.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---
  arch/powerpc/sysdev/xive/common.c | 26 ++++++++++++++++++++++++++
  1 file changed, 26 insertions(+)

diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index 39142df828a018..9ee36b95f9c545 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1826,6 +1826,30 @@ static int xive_eq_debug_show(struct seq_file *m, void *private)
  }
  DEFINE_SHOW_ATTRIBUTE(xive_eq_debug);
  
+static int xive_ipi_cause_debug_set(void *data, u64 val)
+{
+	static void (*do_ipi)(int cpu);
+
+	if (val) {
+		do_ipi = smp_ops->cause_ipi;
+		smp_ops->cause_ipi = xive_cause_ipi;
+	} else {
+		if (do_ipi)
+			smp_ops->cause_ipi = do_ipi;
+	}
+
+	return 0;
+}
+
+static int xive_ipi_cause_debug_get(void *data, u64 *val)
+{
+	*val = xive_cause_ipi == smp_ops->cause_ipi;
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(xive_ipi_cause_debug_fops, xive_ipi_cause_debug_get,
+			 xive_ipi_cause_debug_set, "%llu\n");
+
  static void xive_core_debugfs_create(void)
  {
  	struct dentry *xive_dir;
@@ -1849,6 +1873,8 @@ static void xive_core_debugfs_create(void)
  	}
  	debugfs_create_bool("store-eoi", 0600, xive_dir, &xive_store_eoi);
  	debugfs_create_bool("save-restore", 0600, xive_dir, &xive_has_save_restore);
+	debugfs_create_file("ipi-cause", 0600, xive_dir,
+			    NULL, &xive_ipi_cause_debug_fops);
  }
  
  #endif /* CONFIG_DEBUG_FS */
Michael Ellerman Nov. 18, 2021, 9:24 a.m. UTC | #3
Cédric Le Goater <clg@kaod.org> writes:
> On 11/11/21 11:41, Michael Ellerman wrote:
>> Cédric Le Goater <clg@kaod.org> writes:
>>> On processors with a XIVE interrupt controller (POWER9 and above), the
>>> kernel can use either doorbells or XIVE to generate CPU IPIs. Sending
>>> doorbell is generally preferred to using the XIVE IC because it is
>>> faster. There are cases where we want to avoid doorbells and use XIVE
>>> only, for debug or performance. Only useful on POWER9 and above.
>> 
>> How much do we want this?
>
> Yes. Thanks for asking. It is a recent need.
>
> Here is some background I should have added in the first place. May be
> for a v2.
>
> We have different ways of doing IPIs on POWER9 and above processors,
> depending on the platform and the underlying hypervisor.
>
> - PowerNV uses global doorbells
>
> - pSeries/KVM uses XIVE only because local doorbells are not
>    efficient, as there are emulated in the KVM hypervisor
>
> - pSeries/PowerVM uses XIVE for remote cores and local doorbells for
>    threads on same core (SMT4 or 8)
>
> This recent commit 5b06d1679f2f ("powerpc/pseries: Use doorbells even
> if XIVE is available") introduced the optimization for PowerVM and
> commit 107c55005fbd ("powerpc/pseries: Add KVM guest doorbell
> restrictions") restricted the optimization.
>
> We would like a way to turn off the optimization.

Just for test/debug though?

>> Kernel command line args are a bit of a pain, they tend to be poorly
>> tested, because someone has to explicitly enable them at boot time,
>> and then reboot to test the other case.
>
> True. The "xive=off" parameter was poorly tested initially.
>
>> When would we want to enable this?
>
> For bring-up, for debug, for tests. I have been using a similar switch
> to compare the XIVE interrupt controller performance with doorbells on
> POWER9 and P0WER10.
>
> A new need arises with PowerVM, some configurations will behave as KVM
> (local doorbell are unsupported) and the doorbell=off parameter is a
> simple way to handle this case today.

That's the first I've heard of that, what PowerVM configurations have
non-working doorbells?

>> Can we make the kernel smarter about when to use doorbells and make
>> it automated?
>
> I don't think we want to probe all IPI methods to detect how well
> local doorbells are supported on the platform. Do we ?

We don't *want to*, but sounds like we might have to if they are not
working in some configurations as you mentioned above.

cheers
Cédric Le Goater Nov. 18, 2021, 3:26 p.m. UTC | #4
On 11/18/21 10:24, Michael Ellerman wrote:
> Cédric Le Goater <clg@kaod.org> writes:
>> On 11/11/21 11:41, Michael Ellerman wrote:
>>> Cédric Le Goater <clg@kaod.org> writes:
>>>> On processors with a XIVE interrupt controller (POWER9 and above), the
>>>> kernel can use either doorbells or XIVE to generate CPU IPIs. Sending
>>>> doorbell is generally preferred to using the XIVE IC because it is
>>>> faster. There are cases where we want to avoid doorbells and use XIVE
>>>> only, for debug or performance. Only useful on POWER9 and above.
>>>
>>> How much do we want this?
>>
>> Yes. Thanks for asking. It is a recent need.
>>
>> Here is some background I should have added in the first place. May be
>> for a v2.
>>
>> We have different ways of doing IPIs on POWER9 and above processors,
>> depending on the platform and the underlying hypervisor.
>>
>> - PowerNV uses global doorbells
>>
>> - pSeries/KVM uses XIVE only because local doorbells are not
>>     efficient, as there are emulated in the KVM hypervisor
>>
>> - pSeries/PowerVM uses XIVE for remote cores and local doorbells for
>>     threads on same core (SMT4 or 8)
>>
>> This recent commit 5b06d1679f2f ("powerpc/pseries: Use doorbells even
>> if XIVE is available") introduced the optimization for PowerVM and
>> commit 107c55005fbd ("powerpc/pseries: Add KVM guest doorbell
>> restrictions") restricted the optimization.
>>
>> We would like a way to turn off the optimization.
> 
> Just for test/debug though?

Yes. For now, this is the main target.

>>> Kernel command line args are a bit of a pain, they tend to be poorly
>>> tested, because someone has to explicitly enable them at boot time,
>>> and then reboot to test the other case.
>>
>> True. The "xive=off" parameter was poorly tested initially.
>>
>>> When would we want to enable this?
>>
>> For bring-up, for debug, for tests. I have been using a similar switch
>> to compare the XIVE interrupt controller performance with doorbells on
>> POWER9 and P0WER10.
>>
>> A new need arises with PowerVM, some configurations will behave as KVM
>> (local doorbell are unsupported) and the doorbell=off parameter is a
>> simple way to handle this case today.
> 
> That's the first I've heard of that, what PowerVM configurations have
> non-working doorbells?

It's not released yet.

>>> Can we make the kernel smarter about when to use doorbells and make
>>> it automated?
>>
>> I don't think we want to probe all IPI methods to detect how well
>> local doorbells are supported on the platform. Do we ?
> 
> We don't *want to*, but sounds like we might have to if they are not
> working in some configurations as you mentioned above.

Yeah. This is too early. I will keep that patch for internal use
until we have clarified.

Thanks,

C.
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 3e9da22a2779..07775aa3e81b 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -90,6 +90,7 @@  static inline void ppc_msgsync(void)
 #endif /* CONFIG_PPC_BOOK3S */
 
 extern void doorbell_exception(struct pt_regs *regs);
+extern bool doorbell_disabled;
 
 static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
 {
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 5545c9cd17c1..681ee4775629 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -38,6 +38,23 @@  DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception)
 
 	set_irq_regs(old_regs);
 }
+
+bool doorbell_disabled;
+
+static int __init doorbell_cmdline(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strncmp(arg, "off", 3) == 0) {
+		pr_info("Doorbell disabled on kernel command line\n");
+		doorbell_disabled = true;
+	}
+
+	return 0;
+}
+__setup("doorbell=", doorbell_cmdline);
+
 #else /* CONFIG_SMP */
 DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception)
 {
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index cbb67813cd5d..1311bda9446a 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -338,10 +338,13 @@  static void __init pnv_smp_probe(void)
 		ic_cause_ipi = smp_ops->cause_ipi;
 		WARN_ON(!ic_cause_ipi);
 
-		if (cpu_has_feature(CPU_FTR_ARCH_300))
+		if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+			if (doorbell_disabled)
+				return;
 			smp_ops->cause_ipi = doorbell_global_ipi;
-		else
+		} else {
 			smp_ops->cause_ipi = pnv_cause_ipi;
+		}
 	}
 }
 
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index f47429323eee..3bc9e6aaf645 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -229,6 +229,9 @@  static __init void pSeries_smp_probe(void)
 			return;
 	}
 
+	if (doorbell_disabled)
+		return;
+
 	/*
 	 * Under PowerVM, FSCR[MSGP] is enabled as guest vCPU siblings are
 	 * gang scheduled on the same physical core, so doorbells are always
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 10fa093251e8..2e1284febe39 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1041,6 +1041,16 @@ 
 			The filter can be disabled or changed to another
 			driver later using sysfs.
 
+	doorbell=off	[PPC]
+			On processors with a XIVE interrupt controller
+			(POWER9 and above), the kernel can use either
+			doorbells or XIVE to generate CPU IPIs.	Sending
+			doorbell is generally preferred to using the XIVE
+			IC because it is faster. There are cases where
+			we want to avoid doorbells and use XIVE only,
+			for debug or performance. Only useful on
+			POWER9 and above.
+
 	driver_async_probe=  [KNL]
 			List of driver names to be probed asynchronously.
 			Format: <driver_name1>,<driver_name2>...