[03/17] powerpc/qspinlock: use a half-word store to unlock to avoid larx/stcx.

Message ID: 20220728063120.2867508-5-npiggin@gmail.com (mailing list archive)
State: Changes Requested
Series: powerpc: alternate queued spinlock implementation

Commit Message

Nicholas Piggin July 28, 2022, 6:31 a.m. UTC
The first 16 bits of the lock are only modified by the owner, and other
modifications always use atomic operations on the entire 32 bits, so
unlocks can use plain stores on the 16 bits. This is the same kind of
optimisation done by core qspinlock code.
---
 arch/powerpc/include/asm/qspinlock.h       |  6 +-----
 arch/powerpc/include/asm/qspinlock_types.h | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 7 deletions(-)
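
For context on the title: the cmpxchg loop removed here can only be implemented
on powerpc as a larx/stcx. (load-reserve / store-conditional) retry sequence,
while smp_store_release() on the u16 field should reduce to a release barrier
followed by one plain half-word store. A rough sketch of the two shapes (the
function names and codegen comments are illustrative; exact output depends on
compiler and config):

/* Illustrative comparison, not the kernel's literal codegen. */

static inline void unlock_cmpxchg(struct qspinlock *lock)
{
	/* needs a retry loop:  lwsync; 1: lwarx; ...; stwcx.; bne- 1b */
	for (;;) {
		int val = atomic_read(&lock->val);
		if (atomic_cmpxchg_release(&lock->val, val,
					   val & ~_Q_LOCKED_VAL) == val)
			return;
	}
}

static inline void unlock_halfword(struct qspinlock *lock)
{
	/* straight-line:  lwsync; li rN,0; sth rN,0(lock) */
	smp_store_release(&lock->locked, 0);
}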

Comments

Jordan Niethe Aug. 10, 2022, 3:28 a.m. UTC | #1
On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> The first 16 bits of the lock are only modified by the owner, and other
> modifications always use atomic operations on the entire 32 bits, so
> unlocks can use plain stores on the 16 bits. This is the same kind of
> optimisation done by core qspinlock code.
> ---
>  arch/powerpc/include/asm/qspinlock.h       |  6 +-----
>  arch/powerpc/include/asm/qspinlock_types.h | 19 +++++++++++++++++--
>  2 files changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> index f06117aa60e1..79a1936fb68d 100644
> --- a/arch/powerpc/include/asm/qspinlock.h
> +++ b/arch/powerpc/include/asm/qspinlock.h
> @@ -38,11 +38,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
>  
>  static inline void queued_spin_unlock(struct qspinlock *lock)
>  {
> -	for (;;) {
> -		int val = atomic_read(&lock->val);
> -		if (atomic_cmpxchg_release(&lock->val, val, val & ~_Q_LOCKED_VAL) == val)
> -			return;
> -	}
> +	smp_store_release(&lock->locked, 0);

Is it also possible for lock_set_locked() to use a non-atomic acquire
operation?

>  }
>  
>  #define arch_spin_is_locked(l)		queued_spin_is_locked(l)
> diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
> index 9630e714c70d..3425dab42576 100644
> --- a/arch/powerpc/include/asm/qspinlock_types.h
> +++ b/arch/powerpc/include/asm/qspinlock_types.h
> @@ -3,12 +3,27 @@
>  #define _ASM_POWERPC_QSPINLOCK_TYPES_H
>  
>  #include <linux/types.h>
> +#include <asm/byteorder.h>
>  
>  typedef struct qspinlock {
> -	atomic_t val;
> +	union {
> +		atomic_t val;
> +
> +#ifdef __LITTLE_ENDIAN
> +		struct {
> +			u16	locked;
> +			u8	reserved[2];
> +		};
> +#else
> +		struct {
> +			u8	reserved[2];
> +			u16	locked;
> +		};
> +#endif
> +	};
>  } arch_spinlock_t;

Just to double check we have:

#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		1
#define _Q_LOCKED_MASK		0x00000001
#define _Q_LOCKED_VAL		1

#define _Q_TAIL_CPU_OFFSET	16
#define _Q_TAIL_CPU_BITS	16
#define _Q_TAIL_CPU_MASK	0xffff0000


so the ordering here looks correct.
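
To make that mapping concrete, a standalone model (not kernel code:
uint16_t/uint8_t/uint32_t stand in for u16/u8/atomic_t) showing that a store
of 0 to the locked half-word clears only the _Q_LOCKED_MASK bits and leaves
the _Q_TAIL_CPU_MASK bits untouched on either endianness:

#include <assert.h>
#include <stdint.h>

typedef union {
	uint32_t val;
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	struct { uint8_t reserved[2]; uint16_t locked; };
#else
	struct { uint16_t locked; uint8_t reserved[2]; };
#endif
} lockword;

int main(void)
{
	lockword l = { .val = (1u << 16) | 1 }; /* tail CPU 1, locked */

	assert(l.locked == 1);		/* locked aliases bits 0-15 of val */
	l.locked = 0;			/* the half-word unlock */
	assert(l.val == 1u << 16);	/* tail bits 16-31 preserved */
	return 0;
}

Compiled and run on either a little- or big-endian host, both asserts hold.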

>  
> -#define	__ARCH_SPIN_LOCK_UNLOCKED	{ .val = ATOMIC_INIT(0) }
> +#define	__ARCH_SPIN_LOCK_UNLOCKED	{ { .val = ATOMIC_INIT(0) } }
>  
>  /*
>   * Bitfields in the atomic value:
Jordan Niethe Nov. 10, 2022, 12:39 a.m. UTC | #2
[resend as utf-8, not utf-7]
Nicholas Piggin Nov. 10, 2022, 9:25 a.m. UTC | #3
On Thu Nov 10, 2022 at 10:39 AM AEST, Jordan Niethe wrote:
> On Thu, 2022-07-28 at 16:31 +1000, Nicholas Piggin wrote:
> [resend as utf-8, not utf-7]
> > The first 16 bits of the lock are only modified by the owner, and other
> > modifications always use atomic operations on the entire 32 bits, so
> > unlocks can use plain stores on the 16 bits. This is the same kind of
> > optimisation done by core qspinlock code.
> > ---
> >  arch/powerpc/include/asm/qspinlock.h       |  6 +-----
> >  arch/powerpc/include/asm/qspinlock_types.h | 19 +++++++++++++++++--
> >  2 files changed, 18 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
> > index f06117aa60e1..79a1936fb68d 100644
> > --- a/arch/powerpc/include/asm/qspinlock.h
> > +++ b/arch/powerpc/include/asm/qspinlock.h
> > @@ -38,11 +38,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
> >  
> >  static inline void queued_spin_unlock(struct qspinlock *lock)
> >  {
> > -	for (;;) {
> > -		int val = atomic_read(&lock->val);
> > -		if (atomic_cmpxchg_release(&lock->val, val, val & ~_Q_LOCKED_VAL) == val)
> > -			return;
> > -	}
> > +	smp_store_release(&lock->locked, 0);
>
> Is it also possible for lock_set_locked() to use a non-atomic acquire
> operation?

It has to be atomic as mentioned earlier.

Thanks,
Nick
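
For the record, the asymmetry behind that answer: the unlock store is safe
because only the owner writes the locked half-word while the lock is held,
but on the lock side multiple CPUs can race to set the locked bit while
another concurrently updates the tail bits with a full-word cmpxchg, so
acquisition has to remain a full-word atomic RMW (larx/stcx. based). A hedged
sketch of the shape of the uncontended fast path; the name trylock is
illustrative, not the series' literal code:

static inline int trylock(struct qspinlock *lock)
{
	/* Claim the lock only if the whole 32-bit word is zero: a
	 * racing locker or a concurrent tail update makes the compare
	 * fail, so the claim succeeds only on a completely free lock.
	 */
	return atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0;
}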

Patch

diff --git a/arch/powerpc/include/asm/qspinlock.h b/arch/powerpc/include/asm/qspinlock.h
index f06117aa60e1..79a1936fb68d 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -38,11 +38,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
-	for (;;) {
-		int val = atomic_read(&lock->val);
-		if (atomic_cmpxchg_release(&lock->val, val, val & ~_Q_LOCKED_VAL) == val)
-			return;
-	}
+	smp_store_release(&lock->locked, 0);
 }
 
 #define arch_spin_is_locked(l)		queued_spin_is_locked(l)
diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
index 9630e714c70d..3425dab42576 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -3,12 +3,27 @@ 
 #define _ASM_POWERPC_QSPINLOCK_TYPES_H
 
 #include <linux/types.h>
+#include <asm/byteorder.h>
 
 typedef struct qspinlock {
-	atomic_t val;
+	union {
+		atomic_t val;
+
+#ifdef __LITTLE_ENDIAN
+		struct {
+			u16	locked;
+			u8	reserved[2];
+		};
+#else
+		struct {
+			u8	reserved[2];
+			u16	locked;
+		};
+#endif
+	};
 } arch_spinlock_t;
 
-#define	__ARCH_SPIN_LOCK_UNLOCKED	{ .val = ATOMIC_INIT(0) }
+#define	__ARCH_SPIN_LOCK_UNLOCKED	{ { .val = ATOMIC_INIT(0) } }
 
 /*
  * Bitfields in the atomic value:
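
A closing note on what the new unlock relies on: smp_store_release() orders
every access in the critical section before the store that drops the lock,
pairing with the acquire semantics of the next lock operation. A hedged
two-thread sketch (illustrative pseudo-kernel code, not part of the series):

int shared_data;

void owner(struct qspinlock *lock)
{
	/* ... lock held ... */
	shared_data = 42;			/* critical-section write */
	smp_store_release(&lock->locked, 0);	/* ordered after the write */
}

void next_owner(struct qspinlock *lock)
{
	queued_spin_lock(lock);	/* acquire pairs with the release above */
	/* If this acquire observed the unlock above, shared_data is
	 * guaranteed to read 42 here.
	 */
	queued_spin_unlock(lock);
}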