[1/1] ppc/crash: Skip spinlocks during crash

Message ID 20200326222836.501404-1-leonardo@linux.ibm.com (mailing list archive)
State Superseded, archived

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (c6624071c338732402e8c726df6a4074473eaa0e)
snowpatch_ozlabs/build-ppc64le fail build failed!
snowpatch_ozlabs/build-ppc64be fail build failed!
snowpatch_ozlabs/build-ppc64e fail build failed!
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 39 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Leonardo Bras March 26, 2020, 10:28 p.m. UTC
During a crash, there is a chance that the CPUs handling the NMI IPI
are holding a spin_lock. If this spin_lock is also needed by the crashing
CPU, it will cause a deadlock (rtas_lock and printk's logbuf_lock as of today).

This is a problem if the system has kdump set up: if it crashes for any
reason, the kdump may not be saved for crash analysis.

Skip spinlocks after the NMI IPI is sent to all other CPUs.

Signed-off-by: Leonardo Bras <leonardo@linux.ibm.com>
---
 arch/powerpc/include/asm/spinlock.h | 6 ++++++
 arch/powerpc/kexec/crash.c          | 3 +++
 2 files changed, 9 insertions(+)

Comments

Leonardo Bras March 26, 2020, 11:26 p.m. UTC | #1
Oops, forgot the EXPORT_SYMBOL.
arch_spin_lock*() is used in modules.

Sending v2.
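
Presumably v2 just adds the missing export next to the flag's definition
in arch/powerpc/kexec/crash.c; a minimal sketch of that assumption:

/*
 * Sketch only: export the crash flag so that modules, which inline
 * arch_spin_lock() from asm/spinlock.h, can reference it.
 */
#include <linux/export.h>

bool crash_skip_spinlock __read_mostly;
EXPORT_SYMBOL(crash_skip_spinlock);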

On Thu, 2020-03-26 at 19:28 -0300, Leonardo Bras wrote:
> During a crash, there is chance that the cpus that handle the NMI IPI
> are holding a spin_lock. If this spin_lock is needed by crashing_cpu it
> will cause a deadlock. (rtas_lock and printk logbuf_log as of today)
> 
> This is a problem if the system has kdump set up, given if it crashes
> for any reason kdump may not be saved for crash analysis.
> 
> Skip spinlocks after NMI IPI is sent to all other cpus.
> 
> Signed-off-by: Leonardo Bras <leonardo@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/spinlock.h | 6 ++++++
>  arch/powerpc/kexec/crash.c          | 3 +++
>  2 files changed, 9 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 860228e917dc..a6381d110795 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -111,6 +111,8 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
>  static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
>  #endif
>  
> +extern bool crash_skip_spinlock __read_mostly;
> +
>  static inline bool is_shared_processor(void)
>  {
>  #ifdef CONFIG_PPC_SPLPAR
> @@ -142,6 +144,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
>  		if (likely(__arch_spin_trylock(lock) == 0))
>  			break;
>  		do {
> +			if (unlikely(crash_skip_spinlock))
> +				return;
>  			HMT_low();
>  			if (is_shared_processor())
>  				splpar_spin_yield(lock);
> @@ -161,6 +165,8 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  		local_save_flags(flags_dis);
>  		local_irq_restore(flags);
>  		do {
> +			if (unlikely(crash_skip_spinlock))
> +				return;
>  			HMT_low();
>  			if (is_shared_processor())
>  				splpar_spin_yield(lock);
> diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
> index d488311efab1..8a522380027d 100644
> --- a/arch/powerpc/kexec/crash.c
> +++ b/arch/powerpc/kexec/crash.c
> @@ -66,6 +66,8 @@ static int handle_fault(struct pt_regs *regs)
>  
>  #ifdef CONFIG_SMP
>  
> +bool crash_skip_spinlock;
> +
>  static atomic_t cpus_in_crash;
>  void crash_ipi_callback(struct pt_regs *regs)
>  {
> @@ -129,6 +131,7 @@ static void crash_kexec_prepare_cpus(int cpu)
>  	/* Would it be better to replace the trap vector here? */
>  
>  	if (atomic_read(&cpus_in_crash) >= ncpus) {
> +		crash_skip_spinlock = true;
>  		printk(KERN_EMERG "IPI complete\n");
>  		return;
>  	}
Christophe Leroy March 27, 2020, 6:50 a.m. UTC | #2
On 26/03/2020 at 23:28, Leonardo Bras wrote:
> During a crash, there is chance that the cpus that handle the NMI IPI
> are holding a spin_lock. If this spin_lock is needed by crashing_cpu it
> will cause a deadlock. (rtas_lock and printk logbuf_log as of today)
> 
> This is a problem if the system has kdump set up, given if it crashes
> for any reason kdump may not be saved for crash analysis.
> 
> Skip spinlocks after NMI IPI is sent to all other cpus.
> 
> Signed-off-by: Leonardo Bras <leonardo@linux.ibm.com>
> ---
>   arch/powerpc/include/asm/spinlock.h | 6 ++++++
>   arch/powerpc/kexec/crash.c          | 3 +++
>   2 files changed, 9 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 860228e917dc..a6381d110795 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -111,6 +111,8 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
>   static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
>   #endif
>   
> +extern bool crash_skip_spinlock __read_mostly;
> +
>   static inline bool is_shared_processor(void)
>   {
>   #ifdef CONFIG_PPC_SPLPAR
> @@ -142,6 +144,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
>   		if (likely(__arch_spin_trylock(lock) == 0))
>   			break;
>   		do {
> +			if (unlikely(crash_skip_spinlock))
> +				return;

You are adding a test that reads a global var in the middle of a so hot 
path ? That must kill performance. Can we do different ?

Christophe
Leonardo Bras March 27, 2020, 3:51 p.m. UTC | #3
Hello Christophe, thanks for the feedback.

I noticed an error in this patch and sent a v2, that can be seen here:
http://patchwork.ozlabs.org/patch/1262468/

Comments inline:

On Fri, 2020-03-27 at 07:50 +0100, Christophe Leroy wrote:
> > @@ -142,6 +144,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
> >   		if (likely(__arch_spin_trylock(lock) == 0))
> >   			break;
> >   		do {
> > +			if (unlikely(crash_skip_spinlock))
> > +				return;

Complete function for reference:
static inline void arch_spin_lock(arch_spinlock_t *lock)
{
	while (1) {
		if (likely(__arch_spin_trylock(lock) == 0))
			break;
		do {
			if (unlikely(crash_skip_spinlock))
				return;
			HMT_low();
			if (is_shared_processor())
				splpar_spin_yield(lock);
		} while (unlikely(lock->slock != 0));
		HMT_medium();
	}
}

> You are adding a test that reads a global var in the middle of a so hot 
> path ? That must kill performance. 

I thought it would, in the worst-case scenario, increase the maximum delay of
an arch_spin_lock() call by one spin cycle. Here is what I thought:

- If the lock is already free, it would change nothing.
- Otherwise, the lock will wait.
- The waiting cycle just got bigger.
- Worst-case scenario: running one more cycle, given lock->slock can
turn to 0 just after checking.

Could you please point out where I failed to see the performance penalty?
(I need to get better at this :) )


> Can we do different ?

Sure, a less intrusive way of doing it would be to free the currently
needed locks before proceeding. I just thought it would be harder to
maintain.

> Christophe

Best regards,
Leonardo
Christophe Leroy March 28, 2020, 10:19 a.m. UTC | #4
Hi Leonardo,

On 03/27/2020 03:51 PM, Leonardo Bras wrote:
> Hello Christophe, thanks for the feedback.
> 
> I noticed an error in this patch and sent a v2, that can be seen here:
> http://patchwork.ozlabs.org/patch/1262468/
> 
> Comments inline::
> 
> On Fri, 2020-03-27 at 07:50 +0100, Christophe Leroy wrote:
>>> @@ -142,6 +144,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
>>>    		if (likely(__arch_spin_trylock(lock) == 0))
>>>    			break;
>>>    		do {
>>> +			if (unlikely(crash_skip_spinlock))
>>> +				return;
> 
> Complete function for reference:
> static inline void arch_spin_lock(arch_spinlock_t *lock)
> {
> 	while (1) {
> 		if (likely(__arch_spin_trylock(lock) == 0))
> 			break;
> 		do {
> 			if (unlikely(crash_skip_spinlock))
> 				return;
> 			HMT_low();
> 			if (is_shared_processor())
> 				splpar_spin_yield(lock);
> 		} while (unlikely(lock->slock != 0));
> 		HMT_medium();
> 	}
> }
> 
>> You are adding a test that reads a global var in the middle of a so hot
>> path ? That must kill performance.
> 
> I thought it would, in worst case scenario, increase a maximum delay of
> an arch_spin_lock() call 1 spin cycle. Here is what I thought:
> 
> - If the lock is already free, it would change nothing,
> - Otherwise, the lock will wait.
> - Waiting cycle just got bigger.
> - Worst case scenario: running one more cycle, given lock->slock can
> turn to 0 just after checking.
> 
> Could you please point where I failed to see the performance penalty?
> (I need to get better at this :) )

You are right that when the lock is free, it changes nothing. However 
when it is not, it is not just one cycle.

Here is arch_spin_lock() without your patch:

00000440 <my_lock>:
  440:	39 40 00 01 	li      r10,1
  444:	7d 20 18 28 	lwarx   r9,0,r3
  448:	2c 09 00 00 	cmpwi   r9,0
  44c:	40 82 00 10 	bne     45c <my_lock+0x1c>
  450:	7d 40 19 2d 	stwcx.  r10,0,r3
  454:	40 a2 ff f0 	bne     444 <my_lock+0x4>
  458:	4c 00 01 2c 	isync
  45c:	2f 89 00 00 	cmpwi   cr7,r9,0
  460:	4d be 00 20 	bclr+   12,4*cr7+eq
  464:	7c 21 0b 78 	mr      r1,r1
  468:	81 23 00 00 	lwz     r9,0(r3)
  46c:	2f 89 00 00 	cmpwi   cr7,r9,0
  470:	40 be ff f4 	bne     cr7,464 <my_lock+0x24>
  474:	7c 42 13 78 	mr      r2,r2
  478:	7d 20 18 28 	lwarx   r9,0,r3
  47c:	2c 09 00 00 	cmpwi   r9,0
  480:	40 82 00 10 	bne     490 <my_lock+0x50>
  484:	7d 40 19 2d 	stwcx.  r10,0,r3
  488:	40 a2 ff f0 	bne     478 <my_lock+0x38>
  48c:	4c 00 01 2c 	isync
  490:	2f 89 00 00 	cmpwi   cr7,r9,0
  494:	40 be ff d0 	bne     cr7,464 <my_lock+0x24>
  498:	4e 80 00 20 	blr

Here is arch_spin_lock() with your patch. I enclose with === what comes 
in addition:

00000440 <my_lock>:
  440:	39 40 00 01 	li      r10,1
  444:	7d 20 18 28 	lwarx   r9,0,r3
  448:	2c 09 00 00 	cmpwi   r9,0
  44c:	40 82 00 10 	bne     45c <my_lock+0x1c>
  450:	7d 40 19 2d 	stwcx.  r10,0,r3
  454:	40 a2 ff f0 	bne     444 <my_lock+0x4>
  458:	4c 00 01 2c 	isync
  45c:	2f 89 00 00 	cmpwi   cr7,r9,0
  460:	4d be 00 20 	bclr+   12,4*cr7+eq
=====================================================
  464:	3d 40 00 00 	lis     r10,0
			466: R_PPC_ADDR16_HA	crash_skip_spinlock
  468:	39 4a 00 00 	addi    r10,r10,0
			46a: R_PPC_ADDR16_LO	crash_skip_spinlock
  46c:	39 00 00 01 	li      r8,1
  470:	89 2a 00 00 	lbz     r9,0(r10)
  474:	2f 89 00 00 	cmpwi   cr7,r9,0
  478:	4c 9e 00 20 	bnelr   cr7
=====================================================
  47c:	7c 21 0b 78 	mr      r1,r1
  480:	81 23 00 00 	lwz     r9,0(r3)
  484:	2f 89 00 00 	cmpwi   cr7,r9,0
  488:	40 be ff f4 	bne     cr7,47c <my_lock+0x3c>
  48c:	7c 42 13 78 	mr      r2,r2
  490:	7d 20 18 28 	lwarx   r9,0,r3
  494:	2c 09 00 00 	cmpwi   r9,0
  498:	40 82 00 10 	bne     4a8 <my_lock+0x68>
  49c:	7d 00 19 2d 	stwcx.  r8,0,r3
  4a0:	40 a2 ff f0 	bne     490 <my_lock+0x50>
  4a4:	4c 00 01 2c 	isync
  4a8:	2f 89 00 00 	cmpwi   cr7,r9,0
  4ac:	40 be ff c4 	bne     cr7,470 <my_lock+0x30>
  4b0:	4e 80 00 20 	blr
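
For reference, both listings can be reproduced with a trivial wrapper along
these lines, compiled into a powerpc kernel (CONFIG_SMP, classic spinlocks)
and dumped with objdump -dr; the my_lock name matches the symbol above, the
rest of the harness is an assumption:

#include <linux/spinlock.h>

/* Minimal wrapper, only there to look at the code generated for the
 * arch_spin_lock() fast path and spin loop. */
void my_lock(arch_spinlock_t *lock)
{
	arch_spin_lock(lock);
}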


Christophe
Peter Zijlstra March 30, 2020, 11:02 a.m. UTC | #5
On Fri, Mar 27, 2020 at 07:50:20AM +0100, Christophe Leroy wrote:
> 
> 
> On 26/03/2020 at 23:28, Leonardo Bras wrote:
> > During a crash, there is chance that the cpus that handle the NMI IPI
> > are holding a spin_lock. If this spin_lock is needed by crashing_cpu it
> > will cause a deadlock. (rtas_lock and printk logbuf_log as of today)
> > 
> > This is a problem if the system has kdump set up, given if it crashes
> > for any reason kdump may not be saved for crash analysis.
> > 
> > Skip spinlocks after NMI IPI is sent to all other cpus.
> > 
> > Signed-off-by: Leonardo Bras <leonardo@linux.ibm.com>
> > ---
> >   arch/powerpc/include/asm/spinlock.h | 6 ++++++
> >   arch/powerpc/kexec/crash.c          | 3 +++
> >   2 files changed, 9 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> > index 860228e917dc..a6381d110795 100644
> > --- a/arch/powerpc/include/asm/spinlock.h
> > +++ b/arch/powerpc/include/asm/spinlock.h
> > @@ -111,6 +111,8 @@ static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
> >   static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
> >   #endif
> > +extern bool crash_skip_spinlock __read_mostly;
> > +
> >   static inline bool is_shared_processor(void)
> >   {
> >   #ifdef CONFIG_PPC_SPLPAR
> > @@ -142,6 +144,8 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
> >   		if (likely(__arch_spin_trylock(lock) == 0))
> >   			break;
> >   		do {
> > +			if (unlikely(crash_skip_spinlock))
> > +				return;
> 
> You are adding a test that reads a global var in the middle of a so hot path
> ? That must kill performance. Can we do different ?

This; adding code to a super hot path like this for an exceptional case
like the crash handling seems like a very very bad trade to me.

One possible solution is to simply write 0 to the affected spinlocks
after sending the NMI IPI thing, no?
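
A rough sketch of that alternative, relative to the crash_kexec_prepare_cpus()
hunk above, assuming the only locks that matter are the rtas and printk ones
named in the commit message, and that forcing them back to the unlocked state
is safe once every other CPU is parked in crash_ipi_callback(). The two
crash_reset_*() helpers are hypothetical; the rtas and printk code would have
to provide them:

	if (atomic_read(&cpus_in_crash) >= ncpus) {
		/*
		 * All secondaries are spinning in the crash handler and will
		 * never release the locks they hold, so reset the ones the
		 * crash path still needs (hypothetical helpers).
		 */
		crash_reset_rtas_lock();
		crash_reset_printk_lock();
		printk(KERN_EMERG "IPI complete\n");
		return;
	}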
Leonardo Bras March 30, 2020, 2:12 p.m. UTC | #6
Hello Peter, 

On Mon, 2020-03-30 at 13:02 +0200, Peter Zijlstra wrote:
>  		do {
> > > +			if (unlikely(crash_skip_spinlock))
> > > +				return;
> > 
> > You are adding a test that reads a global var in the middle of a so hot path
> > ? That must kill performance. Can we do different ?
> 
> This; adding code to a super hot patch like this for an exceptional case
> like the crash handling seems like a very very bad trade to me.
> 
> One possible solution is to simply write 0 to the affected spinlocks
> after sending the NMI IPI thing, no?

Yes, I agree.
I suggested this in a comment on v2 of this patch:
http://patchwork.ozlabs.org/patch/1262468/
Leonardo Bras March 30, 2020, 2:33 p.m. UTC | #7
Hello Christophe,

On Sat, 2020-03-28 at 10:19 +0000, Christophe Leroy wrote:
> Hi Leonardo,
> 
> 
> > On 03/27/2020 03:51 PM, Leonardo Bras wrote:
> > > 
> > [SNIP]
> > - If the lock is already free, it would change nothing,
> > - Otherwise, the lock will wait.
> > - Waiting cycle just got bigger.
> > - Worst case scenario: running one more cycle, given lock->slock can
> > turn to 0 just after checking.
> > Could you please point where I failed to see the performance penalty?
> > (I need to get better at this :) )
> 
> You are right that when the lock is free, it changes nothing. However 
> when it is not, it is not just one cycle.

Sorry, what I meant here is one "waiting cycle", meaning that in the
worst-case scenario there would be one extra iteration of that while loop.
Or it would 'spin' one more time.

> 
> Here is arch_spin_lock() without your patch:
> 
> 00000440 <my_lock>:
>   440:	39 40 00 01 	li      r10,1
>   444:	7d 20 18 28 	lwarx   r9,0,r3
>   448:	2c 09 00 00 	cmpwi   r9,0
>   44c:	40 82 00 10 	bne     45c <my_lock+0x1c>
>   450:	7d 40 19 2d 	stwcx.  r10,0,r3
>   454:	40 a2 ff f0 	bne     444 <my_lock+0x4>
>   458:	4c 00 01 2c 	isync
>   45c:	2f 89 00 00 	cmpwi   cr7,r9,0
>   460:	4d be 00 20 	bclr+   12,4*cr7+eq
>   464:	7c 21 0b 78 	mr      r1,r1
>   468:	81 23 00 00 	lwz     r9,0(r3)
>   46c:	2f 89 00 00 	cmpwi   cr7,r9,0
>   470:	40 be ff f4 	bne     cr7,464 <my_lock+0x24>
>   474:	7c 42 13 78 	mr      r2,r2
>   478:	7d 20 18 28 	lwarx   r9,0,r3
>   47c:	2c 09 00 00 	cmpwi   r9,0
>   480:	40 82 00 10 	bne     490 <my_lock+0x50>
>   484:	7d 40 19 2d 	stwcx.  r10,0,r3
>   488:	40 a2 ff f0 	bne     478 <my_lock+0x38>
>   48c:	4c 00 01 2c 	isync
>   490:	2f 89 00 00 	cmpwi   cr7,r9,0
>   494:	40 be ff d0 	bne     cr7,464 <my_lock+0x24>
>   498:	4e 80 00 20 	blr
> 
> Here is arch_spin_lock() with your patch. I enclose with === what comes 
> in addition:
> 
> 00000440 <my_lock>:
>   440:	39 40 00 01 	li      r10,1
>   444:	7d 20 18 28 	lwarx   r9,0,r3
>   448:	2c 09 00 00 	cmpwi   r9,0
>   44c:	40 82 00 10 	bne     45c <my_lock+0x1c>
>   450:	7d 40 19 2d 	stwcx.  r10,0,r3
>   454:	40 a2 ff f0 	bne     444 <my_lock+0x4>
>   458:	4c 00 01 2c 	isync
>   45c:	2f 89 00 00 	cmpwi   cr7,r9,0
>   460:	4d be 00 20 	bclr+   12,4*cr7+eq
> =====================================================
>   464:	3d 40 00 00 	lis     r10,0
> 			466: R_PPC_ADDR16_HA	crash_skip_spinlock
>   468:	39 4a 00 00 	addi    r10,r10,0
> 			46a: R_PPC_ADDR16_LO	crash_skip_spinlock
>   46c:	39 00 00 01 	li      r8,1
>   470:	89 2a 00 00 	lbz     r9,0(r10)
>   474:	2f 89 00 00 	cmpwi   cr7,r9,0
>   478:	4c 9e 00 20 	bnelr   cr7
> =====================================================
>   47c:	7c 21 0b 78 	mr      r1,r1
>   480:	81 23 00 00 	lwz     r9,0(r3)
>   484:	2f 89 00 00 	cmpwi   cr7,r9,0
>   488:	40 be ff f4 	bne     cr7,47c <my_lock+0x3c>
>   48c:	7c 42 13 78 	mr      r2,r2
>   490:	7d 20 18 28 	lwarx   r9,0,r3
>   494:	2c 09 00 00 	cmpwi   r9,0
>   498:	40 82 00 10 	bne     4a8 <my_lock+0x68>
>   49c:	7d 00 19 2d 	stwcx.  r8,0,r3
>   4a0:	40 a2 ff f0 	bne     490 <my_lock+0x50>
>   4a4:	4c 00 01 2c 	isync
>   4a8:	2f 89 00 00 	cmpwi   cr7,r9,0
>   4ac:	40 be ff c4 	bne     cr7,470 <my_lock+0x30>
>   4b0:	4e 80 00 20 	blr
> 
> 
> Christophe

I agree. When there is waiting, it will usually add some time to it.
Considering that spinlocks are widely used, it will cause a slowdown in
the whole system.

Thanks for the feedback,
Best regards,

Patch

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 860228e917dc..a6381d110795 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -111,6 +111,8 @@  static inline void splpar_spin_yield(arch_spinlock_t *lock) {};
 static inline void splpar_rw_yield(arch_rwlock_t *lock) {};
 #endif
 
+extern bool crash_skip_spinlock __read_mostly;
+
 static inline bool is_shared_processor(void)
 {
 #ifdef CONFIG_PPC_SPLPAR
@@ -142,6 +144,8 @@  static inline void arch_spin_lock(arch_spinlock_t *lock)
 		if (likely(__arch_spin_trylock(lock) == 0))
 			break;
 		do {
+			if (unlikely(crash_skip_spinlock))
+				return;
 			HMT_low();
 			if (is_shared_processor())
 				splpar_spin_yield(lock);
@@ -161,6 +165,8 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 		local_save_flags(flags_dis);
 		local_irq_restore(flags);
 		do {
+			if (unlikely(crash_skip_spinlock))
+				return;
 			HMT_low();
 			if (is_shared_processor())
 				splpar_spin_yield(lock);
diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c
index d488311efab1..8a522380027d 100644
--- a/arch/powerpc/kexec/crash.c
+++ b/arch/powerpc/kexec/crash.c
@@ -66,6 +66,8 @@  static int handle_fault(struct pt_regs *regs)
 
 #ifdef CONFIG_SMP
 
+bool crash_skip_spinlock;
+
 static atomic_t cpus_in_crash;
 void crash_ipi_callback(struct pt_regs *regs)
 {
@@ -129,6 +131,7 @@  static void crash_kexec_prepare_cpus(int cpu)
 	/* Would it be better to replace the trap vector here? */
 
 	if (atomic_read(&cpus_in_crash) >= ncpus) {
+		crash_skip_spinlock = true;
 		printk(KERN_EMERG "IPI complete\n");
 		return;
 	}