
[v3,5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR

Message ID 20200706043540.1563616-6-npiggin@gmail.com (mailing list archive)
State Changes Requested
Series powerpc: queued spinlocks and rwlocks

Commit Message

Nicholas Piggin July 6, 2020, 4:35 a.m. UTC
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
 arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
 arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
 arch/powerpc/platforms/pseries/Kconfig        |  5 ++
 arch/powerpc/platforms/pseries/setup.c        |  6 +-
 include/asm-generic/qspinlock.h               |  2 +
 6 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/qspinlock_paravirt.h

Comments

Michael Ellerman July 9, 2020, 10:53 a.m. UTC | #1
Nicholas Piggin <npiggin@gmail.com> writes:

> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
>  arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
>  arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
>  arch/powerpc/platforms/pseries/Kconfig        |  5 ++
>  arch/powerpc/platforms/pseries/setup.c        |  6 +-
>  include/asm-generic/qspinlock.h               |  2 +

Another ack?

> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
> index 7a8546660a63..f2d51f929cf5 100644
> --- a/arch/powerpc/include/asm/paravirt.h
> +++ b/arch/powerpc/include/asm/paravirt.h
> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>  {
>  	___bad_yield_to_preempted(); /* This would be a bug */
>  }
> +
> +extern void ___bad_yield_to_any(void);
> +static inline void yield_to_any(void)
> +{
> +	___bad_yield_to_any(); /* This would be a bug */
> +}

Why do we do that rather than just not defining yield_to_any() at all
and letting the build fail on that?

There's a condition somewhere that we know will be false at compile time
and will drop the call before linking?

> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
> new file mode 100644
> index 000000000000..750d1b5e0202
> --- /dev/null
> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
> @@ -0,0 +1,7 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
> +#define __ASM_QSPINLOCK_PARAVIRT_H

_ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.

> +
> +EXPORT_SYMBOL(__pv_queued_spin_unlock);

Why's that in a header? Should that (eventually) go with the generic implementation?

> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
> index 24c18362e5ea..756e727b383f 100644
> --- a/arch/powerpc/platforms/pseries/Kconfig
> +++ b/arch/powerpc/platforms/pseries/Kconfig
> @@ -25,9 +25,14 @@ config PPC_PSERIES
>  	select SWIOTLB
>  	default y
>  
> +config PARAVIRT_SPINLOCKS
> +	bool
> +	default n

default n is the default.

> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 2db8469e475f..747a203d9453 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>  	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>  		vpa_init(boot_cpuid);
>  
> -		if (lppaca_shared_proc(get_lppaca()))
> +		if (lppaca_shared_proc(get_lppaca())) {
>  			static_branch_enable(&shared_processor);
> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
> +			pv_spinlocks_init();
> +#endif
> +		}

We could avoid the ifdef with this I think?

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 434615f1d761..6ec72282888d 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -10,5 +10,9 @@
 #include <asm/simple_spinlock.h>
 #endif

+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+static inline void pv_spinlocks_init(void) { }
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
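
The call site would then presumably reduce to (sketch):

	if (lppaca_shared_proc(get_lppaca())) {
		static_branch_enable(&shared_processor);
		pv_spinlocks_init();	/* empty stub unless CONFIG_PARAVIRT_SPINLOCKS */
	}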


cheers
Peter Zijlstra July 9, 2020, 11:03 a.m. UTC | #2
On Thu, Jul 09, 2020 at 08:53:16PM +1000, Michael Ellerman wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
> > Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> > ---
> >  arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
> >  arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
> >  arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
> >  arch/powerpc/platforms/pseries/Kconfig        |  5 ++
> >  arch/powerpc/platforms/pseries/setup.c        |  6 +-
> >  include/asm-generic/qspinlock.h               |  2 +
> 
> Another ack?

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Waiman Long July 9, 2020, 4:06 p.m. UTC | #3
On 7/9/20 6:53 AM, Michael Ellerman wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>   arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
>>   arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
>>   arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
>>   arch/powerpc/platforms/pseries/Kconfig        |  5 ++
>>   arch/powerpc/platforms/pseries/setup.c        |  6 +-
>>   include/asm-generic/qspinlock.h               |  2 +
> Another ack?
>
I am OK with adding the #ifdef around queued_spin_lock().

Acked-by: Waiman Long <longman@redhat.com>

>> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>>   {
>>   	___bad_yield_to_preempted(); /* This would be a bug */
>>   }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> +	___bad_yield_to_any(); /* This would be a bug */
>> +}
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
>
> There's a condition somewhere that we know will be false at compile time
> and will drop the call before linking?
>
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index 000000000000..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
>
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
> Why's that in a header? Should that (eventually) go with the generic implementation?
The PV qspinlock implementation is not that generic at the moment. Even 
though native qspinlock is used by a number of archs, PV qspinlock is 
currently used only on x86. This is certainly an area that needs 
improvement.
>> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>>   	select SWIOTLB
>>   	default y
>>   
>> +config PARAVIRT_SPINLOCKS
>> +	bool
>> +	default n
> default n is the default.
>
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>>   	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>>   		vpa_init(boot_cpuid);
>>   
>> -		if (lppaca_shared_proc(get_lppaca()))
>> +		if (lppaca_shared_proc(get_lppaca())) {
>>   			static_branch_enable(&shared_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> +			pv_spinlocks_init();
>> +#endif
>> +		}
> We could avoid the ifdef with this I think?
>
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 434615f1d761..6ec72282888d 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -10,5 +10,9 @@
>   #include <asm/simple_spinlock.h>
>   #endif
>
> +#ifndef CONFIG_PARAVIRT_SPINLOCKS
> +static inline void pv_spinlocks_init(void) { }
> +#endif
> +
>   #endif /* __KERNEL__ */
>   #endif /* __ASM_SPINLOCK_H */
>
>
> cheers
>
We don't really need to do a pv_spinlocks_init() if pv_kick() isn't 
supported.

Cheers,
Longman
Peter Zijlstra July 23, 2020, 2 p.m. UTC | #4
On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:
> We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
> supported.

Waiman, if you cannot explain how not having kick is a sane thing, what
are you saying here?
Nicholas Piggin July 23, 2020, 2:09 p.m. UTC | #5
Excerpts from Michael Ellerman's message of July 9, 2020 8:53 pm:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
>>  arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
>>  arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
>>  arch/powerpc/platforms/pseries/Kconfig        |  5 ++
>>  arch/powerpc/platforms/pseries/setup.c        |  6 +-
>>  include/asm-generic/qspinlock.h               |  2 +
> 
> Another ack?
> 
>> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>>  {
>>  	___bad_yield_to_preempted(); /* This would be a bug */
>>  }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> +	___bad_yield_to_any(); /* This would be a bug */
>> +}
> 
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
> 
> There's a condition somewhere that we know will be false at compile time
> and will drop the call before linking?

Mainly so you could use it in if (IS_ENABLED()) blocks, but it would still
catch the (presumably buggy) case where something calls it without the
option set.
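
That is, a caller can do (sketch; example() is a made-up illustration):

	void example(void)
	{
		if (IS_ENABLED(CONFIG_PPC_SPLPAR))
			yield_to_any();
	}

When the option is off, the branch is discarded at compile time, so the
undefined ___bad_yield_to_any() is never referenced; a call that somehow
survives still fails at link time instead of building silently.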

I think I had it arranged a different way that was using IS_ENABLED 
earlier, and changed it, but might as well keep it this way.

> 
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index 000000000000..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
> 
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
> 
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
> 
> Why's that in a header? Should that (eventually) go with the generic implementation?

Yeah, the qspinlock_paravirt.h header is a bit weird and only gets 
included into kernel/locking/qspinlock.c.

>> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>>  	select SWIOTLB
>>  	default y
>>  
>> +config PARAVIRT_SPINLOCKS
>> +	bool
>> +	default n
> 
> default n is the default.
> 
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>>  	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>>  		vpa_init(boot_cpuid);
>>  
>> -		if (lppaca_shared_proc(get_lppaca()))
>> +		if (lppaca_shared_proc(get_lppaca())) {
>>  			static_branch_enable(&shared_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> +			pv_spinlocks_init();
>> +#endif
>> +		}
> 
> We could avoid the ifdef with this I think?

Yes I think so.

Thanks,
Nick
Waiman Long July 23, 2020, 6:32 p.m. UTC | #6
On 7/23/20 10:00 AM, Peter Zijlstra wrote:
> On Thu, Jul 09, 2020 at 12:06:13PM -0400, Waiman Long wrote:
>> We don't really need to do a pv_spinlocks_init() if pv_kick() isn't
>> supported.
> Waiman, if you cannot explain how not having kick is a sane thing, what
> are you saying here?
>
The current PPC paravirt spinlock code doesn't do any cpu kick. It does 
the equivalent of pv_wait by yielding the cpu to the lock holder only. 
The pv_spinlocks_init() is for setting up the hash table for doing 
pv_kick. If we don't need to do pv_kick, we don't need the hash table.
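
Roughly, the pairing that hash table exists to support looks like this
(simplified sketch; qhash_insert()/qhash_find() are stand-ins for the
real pv_hash()/pv_unhash() in kernel/locking/qspinlock_paravirt.h):

	/* waiter side: publish ourselves so unlock can find us, then sleep */
	qhash_insert(lock, node);		/* hash: lock -> waiting node */
	pv_wait(&lock->locked, _Q_SLOW_VAL);	/* sleep until pv_kick()ed */

	/* unlock side: look up the waiter, release, then kick its cpu */
	node = qhash_find(lock);
	smp_store_release(&lock->locked, 0);
	pv_kick(node->cpu);			/* the kick this series doesn't do */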

I am not saying that pv_kick is not needed for the PPC environment. I 
was just trying to adapt the pvqspinlock code to such an environment 
first. Further investigation on how to implement some kind of pv_kick 
will be something that we may want to do as a follow-on.

BTW, do you have any comment on my v2 lock holder cpu info qspinlock 
patch? I will have to update the patch to fix the reported 0-day test 
problem, but I want to collect other feedback before sending out v3.

Cheers,
Longman
Peter Zijlstra July 23, 2020, 6:47 p.m. UTC | #7
On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> I will have to update the patch to fix the reported 0-day test problem, but
> I want to collect other feedback before sending out v3.

I want to say I hate it all, it adds instructions to a path we spend an
awful lot of time optimizing without really getting anything back for
it.

Will, how do you feel about it?
Waiman Long July 23, 2020, 7:04 p.m. UTC | #8
On 7/23/20 2:47 PM, peterz@infradead.org wrote:
> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>> I will have to update the patch to fix the reported 0-day test problem, but
>> I want to collect other feedback before sending out v3.
> I want to say I hate it all, it adds instructions to a path we spend an
> awful lot of time optimizing without really getting anything back for
> it.

It does add some extra instructions that may slow it down slightly, but I 
don't agree that it gives nothing back. The cpu lock holder information 
can be useful in analyzing crash dumps and in some debugging situations. 
I think it can be useful in RHEL for this reason. How about an x86 
config option to allow distros to decide if they want to have it 
enabled? I will make sure that it will have no performance degradation 
if the option is not enabled.

Cheers,
Longman
Peter Zijlstra July 23, 2020, 7:58 p.m. UTC | #9
On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote:
> On 7/23/20 2:47 PM, peterz@infradead.org wrote:
> > On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > > I will have to update the patch to fix the reported 0-day test problem, but
> > > I want to collect other feedback before sending out v3.
> > I want to say I hate it all, it adds instructions to a path we spend an
> > awful lot of time optimizing without really getting anything back for
> > it.
> 
> It does add some extra instructions that may slow it down slightly, but I
> don't agree that it gives nothing back. The cpu lock holder information can
> be useful in analyzing crash dumps and in some debugging situations. I think
> it can be useful in RHEL for this reason. How about an x86 config option to
> allow distros to decide if they want to have it enabled? I will make sure
> that it will have no performance degradation if the option is not enabled.

Config knobs suck too; they create a maintenance burden (we get to make
sure all the permutations work/build/etc.) and effectively nobody uses
them, since world+dog uses what distros pick.

Anyway, instead of adding a second per-cpu variable, can you see how
horrible something like this is:

unsigned char adds(unsigned char var, unsigned char val)
{
	unsigned short sat = 0xff, tmp = var;

	asm ("addb	%[val], %b[var];"
	     "cmovc	%[sat], %[var];"
	     : [var] "+r" (tmp)
	     : [val] "ir" (val), [sat] "r" (sat)
	     );

	return tmp;
}

Another thing to try is, instead of threading that lockval throughout
the thing, simply:

#define _Q_LOCKED_VAL	this_cpu_read_stable(cpu_sat)

or combined with the above

#define _Q_LOCKED_VAL	adds(this_cpu_read_stable(cpu_number), 2)

and see if the compiler really makes a mess of things.
Segher Boessenkool July 23, 2020, 8:30 p.m. UTC | #10
On Thu, Jul 23, 2020 at 09:58:55PM +0200, peterz@infradead.org wrote:
> 	asm ("addb	%[val], %b[var];"
> 	     "cmovc	%[sat], %[var];"
> 	     : [var] "+r" (tmp)
> 	     : [val] "ir" (val), [sat] "r" (sat)
> 	     );

"var" (operand 0) needs an earlyclobber ("sat" is read after "var" is
written for the first time).
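
Concretely, the same asm with the earlyclobber added (untested sketch):

	asm ("addb	%[val], %b[var];"
	     "cmovc	%[sat], %[var];"
	     : [var] "+&r" (tmp)	/* "&": var is written before sat is read */
	     : [val] "ir" (val), [sat] "r" (sat)
	     );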


Segher
Waiman Long July 23, 2020, 9:58 p.m. UTC | #11
On 7/23/20 3:58 PM, peterz@infradead.org wrote:
> On Thu, Jul 23, 2020 at 03:04:13PM -0400, Waiman Long wrote:
>> On 7/23/20 2:47 PM, peterz@infradead.org wrote:
>>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>>>> I will have to update the patch to fix the reported 0-day test problem, but
>>>> I want to collect other feedback before sending out v3.
>>> I want to say I hate it all, it adds instructions to a path we spend an
>>> awful lot of time optimizing without really getting anything back for
>>> it.
>> It does add some extra instructions that may slow it down slightly, but I
>> don't agree that it gives nothing back. The cpu lock holder information can
>> be useful in analyzing crash dumps and in some debugging situations. I think
>> it can be useful in RHEL for this reason. How about an x86 config option to
>> allow distros to decide if they want to have it enabled? I will make sure
>> that it will have no performance degradation if the option is not enabled.
> Config knobs suck too; they create a maintenance burden (we get to make
> sure all the permutations work/build/etc.) and effectively nobody uses
> them, since world+dog uses what distros pick.
>
> Anyway, instead of adding a second per-cpu variable, can you see how
> horrible something like this is:
>
> unsigned char adds(unsigned char var, unsigned char val)
> {
> 	unsigned short sat = 0xff, tmp = var;
>
> 	asm ("addb	%[val], %b[var];"
> 	     "cmovc	%[sat], %[var];"
> 	     : [var] "+r" (tmp)
> 	     : [val] "ir" (val), [sat] "r" (sat)
> 	     );
>
> 	return tmp;
> }
>
> Another thing to try is, instead of threading that lockval throughout
> the thing, simply:
>
> #define _Q_LOCKED_VAL	this_cpu_read_stable(cpu_sat)
>
> or combined with the above
>
> #define _Q_LOCKED_VAL	adds(this_cpu_read_stable(cpu_number), 2)
>
> and see if the compiler really makes a mess of things.
>
Thanks for the suggestion. I will try that out.

Cheers,
Longman
Will Deacon July 24, 2020, 8:16 a.m. UTC | #12
On Thu, Jul 23, 2020 at 08:47:59PM +0200, peterz@infradead.org wrote:
> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > I will have to update the patch to fix the reported 0-day test problem, but
> > I want to collect other feedback before sending out v3.
> 
> I want to say I hate it all, it adds instructions to a path we spend an
> awful lot of time optimizing without really getting anything back for
> it.
> 
> Will, how do you feel about it?

I can see it potentially being useful for debugging, but I hate the
limitation to 256 CPUs. Even arm64 is hitting that now.

Also, you're talking ~1% gains here. I think our collective time would
be better spent reviewing the CNA series and trying to make it more
deterministic.

Will
Waiman Long July 24, 2020, 7:10 p.m. UTC | #13
On 7/24/20 4:16 AM, Will Deacon wrote:
> On Thu, Jul 23, 2020 at 08:47:59PM +0200, peterz@infradead.org wrote:
>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>>> I will have to update the patch to fix the reported 0-day test problem, but
>>> I want to collect other feedback before sending out v3.
>> I want to say I hate it all, it adds instructions to a path we spend an
>> awful lot of time optimizing without really getting anything back for
>> it.
>>
>> Will, how do you feel about it?
> I can see it potentially being useful for debugging, but I hate the
> limitation to 256 CPUs. Even arm64 is hitting that now.

After thinking more about that, I think we can use all the remaining 
bits in the 16-bit locked_pending. Reserving 1 bit for locked and 1 bit 
for pending, there are 14 bits left. So as long as NR_CPUS < 16k 
(requirement for 16-bit locked_pending), we can put all possible cpu 
numbers into the lock. We can also just use smp_processor_id() without 
additional percpu data.

>
> Also, you're talking ~1% gains here. I think our collective time would
> be better spent reviewing the CNA series and trying to make it more
> deterministic.

I thought you guys were not interested in CNA. I do want to get CNA 
merged, if possible. Let's review the current version again and see if 
there are ways we can further improve it.

Cheers,
Longman
Waiman Long July 25, 2020, 3:02 a.m. UTC | #14
On 7/24/20 3:10 PM, Waiman Long wrote:
> On 7/24/20 4:16 AM, Will Deacon wrote:
>> On Thu, Jul 23, 2020 at 08:47:59PM +0200, peterz@infradead.org wrote:
>>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>>> BTW, do you have any comment on my v2 lock holder cpu info 
>>>> qspinlock patch?
>>>> I will have to update the patch to fix the reported 0-day test 
>>>> problem, but
>>>> I want to collect other feedback before sending out v3.
>>> I want to say I hate it all, it adds instructions to a path we spend an
>>> awful lot of time optimizing without really getting anything back for
>>> it.
>>>
>>> Will, how do you feel about it?
>> I can see it potentially being useful for debugging, but I hate the
>> limitation to 256 CPUs. Even arm64 is hitting that now.
>
> After thinking more about that, I think we can use all the remaining 
> bits in the 16-bit locked_pending. Reserving 1 bit for locked and 1 
> bit for pending, there are 14 bits left. So as long as NR_CPUS < 16k 
> (requirement for 16-bit locked_pending), we can put all possible cpu 
> numbers into the lock. We can also just use smp_processor_id() without 
> additional percpu data. 

Sorry, that doesn't work. The extra bits in the pending byte won't get 
cleared on unlock. That will have a noticeable performance impact. 
Clearing the pending byte on unlock would cause other performance 
problems. So I guess we will have to limit the cpu number to the locked byte.
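
(For reference, the unlock fast path in the patch below shows why: it
releases only the locked byte, so bits stashed in the pending byte would
survive the unlock.)

	static inline void queued_spin_unlock(struct qspinlock *lock)
	{
		smp_store_release(&lock->locked, 0);	/* pending byte untouched */
	}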

Regards,
Longman
Peter Zijlstra July 25, 2020, 5:26 p.m. UTC | #15
On Fri, Jul 24, 2020 at 03:10:59PM -0400, Waiman Long wrote:
> On 7/24/20 4:16 AM, Will Deacon wrote:
> > On Thu, Jul 23, 2020 at 08:47:59PM +0200, peterz@infradead.org wrote:
> > > On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
> > > > BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
> > > > I will have to update the patch to fix the reported 0-day test problem, but
> > > > I want to collect other feedback before sending out v3.
> > > I want to say I hate it all, it adds instructions to a path we spend an
> > > awful lot of time optimizing without really getting anything back for
> > > it.
> > > 
> > > Will, how do you feel about it?
> > I can see it potentially being useful for debugging, but I hate the
> > limitation to 256 CPUs. Even arm64 is hitting that now.
> 
> After thinking more about that, I think we can use all the remaining bits in
> the 16-bit locked_pending. Reserving 1 bit for locked and 1 bit for pending,
> there are 14 bits left. So as long as NR_CPUS < 16k (requirement for 16-bit
> locked_pending), we can put all possible cpu numbers into the lock. We can
> also just use smp_processor_id() without additional percpu data.

That sounds horrific, wouldn't that destroy the whole point of using a
byte for pending?

> > Also, you're talking ~1% gains here. I think our collective time would
> > be better spent reviewing the CNA series and trying to make it more
> > deterministic.
> 
> I thought you guys were not interested in CNA. I do want to get CNA merged,
> if possible. Let's review the current version again and see if there are ways
> we can further improve it.

It's not a lack of interest. We were struggling with the fairness
issues and the complexity of the thing. I forgot the current state of
matters, but at one point UNLOCK was O(n) in waiters, which is, of
course, 'unfortunate'.

I'll have to look up whatever notes remain, but the basic idea of
keeping remote nodes on a secondary list is obviously breaking all sorts
of fairness. After that they pile on a bunch of hacks to fix the worst
of them, but it feels exactly like that, a bunch of hacks.

One of the things I suppose we ought to do is see if some of the ideas
of phase-fair locks can be applied to this.

That coupled with a chronic lack of time for anything :-(
Waiman Long July 25, 2020, 5:36 p.m. UTC | #16
On 7/25/20 1:26 PM, Peter Zijlstra wrote:
> On Fri, Jul 24, 2020 at 03:10:59PM -0400, Waiman Long wrote:
>> On 7/24/20 4:16 AM, Will Deacon wrote:
>>> On Thu, Jul 23, 2020 at 08:47:59PM +0200, peterz@infradead.org wrote:
>>>> On Thu, Jul 23, 2020 at 02:32:36PM -0400, Waiman Long wrote:
>>>>> BTW, do you have any comment on my v2 lock holder cpu info qspinlock patch?
>>>>> I will have to update the patch to fix the reported 0-day test problem, but
>>>>> I want to collect other feedback before sending out v3.
>>>> I want to say I hate it all, it adds instructions to a path we spend an
>>>> awful lot of time optimizing without really getting anything back for
>>>> it.
>>>>
>>>> Will, how do you feel about it?
>>> I can see it potentially being useful for debugging, but I hate the
>>> limitation to 256 CPUs. Even arm64 is hitting that now.
>> After thinking more about that, I think we can use all the remaining bits in
>> the 16-bit locked_pending. Reserving 1 bit for locked and 1 bit for pending,
>> there are 14 bits left. So as long as NR_CPUS < 16k (requirement for 16-bit
>> locked_pending), we can put all possible cpu numbers into the lock. We can
>> also just use smp_processor_id() without additional percpu data.
> That sounds horrific, wouldn't that destroy the whole point of using a
> byte for pending?
You are right. I realized that later on and had sent a follow-up mail to 
correct that.
>>> Also, you're talking ~1% gains here. I think our collective time would
>>> be better spent reviewing the CNA series and trying to make it more
>>> deterministic.
>> I thought you guys were not interested in CNA. I do want to get CNA merged,
>> if possible. Let's review the current version again and see if there are ways
>> we can further improve it.
> It's not a lack of interest. We were struggling with the fairness
> issues and the complexity of the thing. I forgot the current state of
> matters, but at one point UNLOCK was O(n) in waiters, which is, of
> course, 'unfortunate'.
>
> I'll have to look up whatever notes remain, but the basic idea of
> keeping remote nodes on a secondary list is obviously breaking all sorts
> of fairness. After that they pile on a bunch of hacks to fix the worst
> of them, but it feels exactly like that, a bunch of hacks.
>
> One of the things I suppose we ought to do is see if some of the ideas
> of phase-fair locks can be applied to this.
That could be a possible solution to ensure better fairness.
>
> That coupled with a chronic lack of time for anything :-(
>
That is always true, and I feel this way too. :-)

Cheers,
Longman

Patch

diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index 7a8546660a63..f2d51f929cf5 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -29,6 +29,16 @@  static inline void yield_to_preempted(int cpu, u32 yield_count)
 {
 	plpar_hcall_norets(H_CONFER, get_hard_smp_processor_id(cpu), yield_count);
 }
+
+static inline void prod_cpu(int cpu)
+{
+	plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu));
+}
+
+static inline void yield_to_any(void)
+{
+	plpar_hcall_norets(H_CONFER, -1, 0);
+}
 #else
 static inline bool is_shared_processor(void)
 {
@@ -45,6 +55,19 @@  static inline void yield_to_preempted(int cpu, u32 yield_count)
 {
 	___bad_yield_to_preempted(); /* This would be a bug */
 }
+
+extern void ___bad_yield_to_any(void);
+static inline void yield_to_any(void)
+{
+	___bad_yield_to_any(); /* This would be a bug */
+}
+
+extern void ___bad_prod_cpu(void);
+static inline void prod_cpu(int cpu)
+{
+	___bad_prod_cpu(); /* This would be a bug */
+}
+
 #endif
 
 #define vcpu_is_preempted vcpu_is_preempted
@@ -57,5 +80,10 @@  static inline bool vcpu_is_preempted(int cpu)
 	return false;
 }
 
+static inline bool pv_is_native_spin_unlock(void)
+{
+     return !is_shared_processor();
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_PARAVIRT_H */
diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index c49e33e24edd..f5066f00a08c 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -3,9 +3,47 @@ 
 #define _ASM_POWERPC_QSPINLOCK_H
 
 #include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
 
 #define _Q_PENDING_LOOPS	(1 << 9) /* not tuned */
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
+
+static __always_inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	if (!is_shared_processor())
+		native_queued_spin_lock_slowpath(lock, val);
+	else
+		__pv_queued_spin_lock_slowpath(lock, val);
+}
+
+#define queued_spin_unlock queued_spin_unlock
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	if (!is_shared_processor())
+		smp_store_release(&lock->locked, 0);
+	else
+		__pv_queued_spin_unlock(lock);
+}
+
+#else
+extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+#endif
+
+static __always_inline void queued_spin_lock(struct qspinlock *lock)
+{
+	u32 val = 0;
+
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
+		return;
+
+	queued_spin_lock_slowpath(lock, val);
+}
+#define queued_spin_lock queued_spin_lock
+
 #define smp_mb__after_spinlock()   smp_mb()
 
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
@@ -20,6 +58,34 @@  static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 }
 #define queued_spin_is_locked queued_spin_is_locked
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#define SPIN_THRESHOLD (1<<15) /* not tuned */
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+	if (*ptr != val)
+		return;
+	yield_to_any();
+	/*
+	 * We could pass in a CPU here if waiting in the queue and yield to
+	 * the previous CPU in the queue.
+	 */
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	prod_cpu(cpu);
+}
+
+extern void __pv_init_lock_hash(void);
+
+static inline void pv_spinlocks_init(void)
+{
+	__pv_init_lock_hash();
+}
+
+#endif
+
 #include <asm-generic/qspinlock.h>
 
 #endif /* _ASM_POWERPC_QSPINLOCK_H */
diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..750d1b5e0202
--- /dev/null
+++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,7 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+EXPORT_SYMBOL(__pv_queued_spin_unlock);
+
+#endif /* __ASM_QSPINLOCK_PARAVIRT_H */
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 24c18362e5ea..756e727b383f 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -25,9 +25,14 @@  config PPC_PSERIES
 	select SWIOTLB
 	default y
 
+config PARAVIRT_SPINLOCKS
+	bool
+	default n
+
 config PPC_SPLPAR
 	depends on PPC_PSERIES
 	bool "Support for shared-processor logical partitions"
+	select PARAVIRT_SPINLOCKS if PPC_QUEUED_SPINLOCKS
 	help
 	  Enabling this option will make the kernel run more efficiently
 	  on logically-partitioned pSeries systems which use shared
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..747a203d9453 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -771,8 +771,12 @@  static void __init pSeries_setup_arch(void)
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
 		vpa_init(boot_cpuid);
 
-		if (lppaca_shared_proc(get_lppaca()))
+		if (lppaca_shared_proc(get_lppaca())) {
 			static_branch_enable(&shared_processor);
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+			pv_spinlocks_init();
+#endif
+		}
 
 		ppc_md.power_save = pseries_lpar_idle;
 		ppc_md.enable_pmcs = pseries_lpar_enable_pmcs;
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fb0a814d4395..38ca14e79a86 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -69,6 +69,7 @@  static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 
 extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 
+#ifndef queued_spin_lock
 /**
  * queued_spin_lock - acquire a queued spinlock
  * @lock: Pointer to queued spinlock structure
@@ -82,6 +83,7 @@  static __always_inline void queued_spin_lock(struct qspinlock *lock)
 
 	queued_spin_lock_slowpath(lock, val);
 }
+#endif
 
 #ifndef queued_spin_unlock
 /**