
Convert powerpc simple spinlocks into ticket locks

Message ID 20140206103736.GA18054@lst.de (mailing list archive)
State Superseded
Headers show

Commit Message

Torsten Duwe Feb. 6, 2014, 10:37 a.m. UTC
x86 has them, MIPS has them, ARM has them, even ia64 has them:
ticket locks. They reduce memory bus and cache pressure especially
for contended spinlocks, increasing performance.

This patch is a port of the x86 spin locks, mostly written in C,
to the powerpc, introducing inline asm where needed. The pSeries
directed yield for vCPUs is taken care of by an additional "holder"
field in the lock.
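
For illustration only, the basic ticket principle in generic C (using
gcc __atomic builtins, not the powerpc code in this patch) looks like:

typedef struct {
	unsigned short head;	/* ticket currently being served */
	unsigned short tail;	/* next ticket to hand out */
} tickets;

void lock(tickets *t)
{
	unsigned short me = __atomic_fetch_add(&t->tail, 1, __ATOMIC_RELAXED);

	while (__atomic_load_n(&t->head, __ATOMIC_ACQUIRE) != me)
		;	/* spin until our number is served */
}

void unlock(tickets *t)
{
	/* only the lock holder writes head, so a plain increment is enough */
	__atomic_store_n(&t->head, t->head + 1, __ATOMIC_RELEASE);
}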

Signed-off-by: Torsten Duwe <duwe@suse.de>
--
 arch/powerpc/include/asm/spinlock_types.h |   27 ++++-
 arch/powerpc/include/asm/spinlock.h       |  157 +++++++++++++++++++++++-------
 arch/powerpc/lib/locks.c                  |    6 -
 3 files changed, 151 insertions(+), 39 deletions(-)

Comments

Benjamin Herrenschmidt Feb. 6, 2014, 3:53 p.m. UTC | #1
On Thu, 2014-02-06 at 11:37 +0100, Torsten Duwe wrote:
> x86 has them, MIPS has them, ARM has them, even ia64 has them:
> ticket locks. They reduce memory bus and cache pressure especially
> for contended spinlocks, increasing performance.
> 
> This patch is a port of the x86 spin locks, mostly written in C,
> to the powerpc, introducing inline asm where needed. The pSeries
> directed yield for vCPUs is taken care of by an additional "holder"
> field in the lock.

Thanks, I've been meaning to look into this for ages and never quite got
to it :-)

I'm travelling right now, I'll review this next week.

Cheers,
Ben.

> Signed-off-by: Torsten Duwe <duwe@suse.de>
> --
>  arch/powerpc/include/asm/spinlock_types.h |   27 ++++-
>  arch/powerpc/include/asm/spinlock.h       |  157 +++++++++++++++++++++++-------
>  arch/powerpc/lib/locks.c                  |    6 -
>  3 files changed, 151 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
> index 2351adc..64a98be 100644
> --- a/arch/powerpc/include/asm/spinlock_types.h
> +++ b/arch/powerpc/include/asm/spinlock_types.h
> @@ -5,11 +5,30 @@
>  # error "please don't include this file directly"
>  #endif
>  
> -typedef struct {
> -	volatile unsigned int slock;
> -} arch_spinlock_t;
> +typedef u16 __ticket_t;
> +typedef u32 __ticketpair_t;
> +
> +#define TICKET_LOCK_INC	((__ticket_t)1)
> +
> +#define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
> +
> +typedef struct arch_spinlock {
> +	union {
> +		__ticketpair_t head_tail;
> +		struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__		/* The "tail" part should be in the MSBs */
> +			__ticket_t tail, head;
> +#else
> +			__ticket_t head, tail;
> +#endif
> +		} tickets;
> +	};
> +#if defined(CONFIG_PPC64)
> +	u32 holder;
> +#endif
> +} arch_spinlock_t __aligned(8);
>  
> -#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
> +#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 }, 0 }
>  
>  typedef struct {
>  	volatile signed int lock;
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 5f54a74..6e33691 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -9,8 +9,7 @@
>   * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
>   * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
>   *	Rework to support virtual processors
> - *
> - * Type of int is used as a full 64b word is not necessary.
> + * Copyright (C) 2014 Torsten Duwe <duwe@suse.de>, ticket lock port
>   *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
> @@ -28,7 +27,20 @@
>  #include <asm/synch.h>
>  #include <asm/ppc-opcode.h>
>  
> -#define arch_spin_is_locked(x)		((x)->slock != 0)
> +static inline int arch_spin_is_locked(arch_spinlock_t *lock)
> +{
> +	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> +
> +	return tmp.tail != tmp.head;
> +}
> +
> +static inline int arch_spin_is_contended(arch_spinlock_t *lock)
> +{
> +	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
> +
> +	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
> +}
> +#define arch_spin_is_contended	arch_spin_is_contended
>  
>  #ifdef CONFIG_PPC64
>  /* use 0x800000yy when locked, where yy == CPU number */
> @@ -37,8 +49,12 @@
>  #else
>  #define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
>  #endif
> +#define SIZE_LETTER	"d"
> +#define L64		"1"
>  #else
>  #define LOCK_TOKEN	1
> +#define SIZE_LETTER	"w"
> +#define L64		"0"
>  #endif
>  
>  #if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
> @@ -55,33 +71,59 @@
>  #endif
>  
>  /*
> - * This returns the old value in the lock, so we succeeded
> - * in getting the lock if the return value is 0.
> + * Our own cmpxchg, operating on spinlock_t's.  Returns 0 iff value
> + * read at lock was equal to "old" AND the cmpxchg succeeded
> + * uninterruptedly.
>   */
> -static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
> +static __always_inline int __arch_spin_cmpxchg_eq(arch_spinlock_t *lock,
> +					       arch_spinlock_t old,
> +					       arch_spinlock_t new)
>  {
> -	unsigned long tmp, token;
> +	register int retval = 1;
> +	register arch_spinlock_t tmp;
> +
> +	__asm__ __volatile__ (
> +"	li %0,1\n"		/* default to "fail" */
> +	PPC_RELEASE_BARRIER
> +"1:	l" SIZE_LETTER "arx   %2,0,%5         # __arch_spin_cmpxchg_eq\n"
> +"	cmp 0,"L64",%3,%2\n"
> +"	bne-    2f\n"
> +	PPC405_ERR77(0, "%5")
> +"	st" SIZE_LETTER "cx.  %4,0,%5\n"
> +"	bne-    1b\n"
> +"	isync\n"
> +"	li %0,0\n"
> +"2:"
> +	: "=&r" (retval), "+m" (*lock)
> +	: "r" (tmp), "r" (old), "r" (new), "r" (lock)
> +	: "cc", "memory");
> +
> +	return retval;
> +}
> +#undef SIZE_LETTER
> +#undef L64
>  
> -	token = LOCK_TOKEN;
> -	__asm__ __volatile__(
> -"1:	" PPC_LWARX(%0,0,%2,1) "\n\
> -	cmpwi		0,%0,0\n\
> -	bne-		2f\n\
> -	stwcx.		%1,0,%2\n\
> -	bne-		1b\n"
> -	PPC_ACQUIRE_BARRIER
> -"2:"
> -	: "=&r" (tmp)
> -	: "r" (token), "r" (&lock->slock)
> -	: "cr0", "memory");
> +static __always_inline int __arch_spin_trylock(arch_spinlock_t *lock)
> +{
> +	arch_spinlock_t old, new;
>  
> -	return tmp;
> +	old.tickets = ACCESS_ONCE(lock->tickets);
> +	if (old.tickets.head != old.tickets.tail)
> +		return 0;
> +
> +	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
> +#if defined(CONFIG_PPC64)
> +	old.holder = lock->holder;
> +	new.holder = LOCK_TOKEN;
> +#endif
> +
> +	return !__arch_spin_cmpxchg_eq(lock, old, new);
>  }
>  
>  static inline int arch_spin_trylock(arch_spinlock_t *lock)
>  {
>  	CLEAR_IO_SYNC;
> -	return __arch_spin_trylock(lock) == 0;
> +	return  __arch_spin_trylock(lock);
>  }
>  
>  /*
> @@ -93,9 +135,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
>   * rest of our timeslice to the lock holder.
>   *
>   * So that we can tell which virtual processor is holding a lock,
> - * we put 0x80000000 | smp_processor_id() in the lock when it is
> - * held.  Conveniently, we have a word in the paca that holds this
> - * value.
> + * we put 0x80000000 | smp_processor_id() into lock->holder.
> + * Conveniently, we have a word in the paca that holds this value.
>   */
>  
>  #if defined(CONFIG_PPC_SPLPAR)
> @@ -109,19 +150,59 @@ extern void __rw_yield(arch_rwlock_t *lock);
>  #define SHARED_PROCESSOR	0
>  #endif
>  
> -static inline void arch_spin_lock(arch_spinlock_t *lock)
> +/*
> + * Ticket locks are conceptually two parts, one indicating the current head of
> + * the queue, and the other indicating the current tail. The lock is acquired
> + * by atomically noting the tail and incrementing it by one (thus adding
> + * ourself to the queue and noting our position), then waiting until the head
> + * becomes equal to the initial value of the tail.
> + *
> + * We use an asm covering *both* parts of the lock, to increment the tail and
> + * also load the position of the head, which takes care of memory ordering
> + * issues and should be optimal for the uncontended case. Note the tail must be
> + * in the high part, because a wide add increment of the low part would carry
> + * up and contaminate the high part.
> + */
> +static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
>  {
> +	register struct __raw_tickets old, tmp,
> +		inc = { .tail = TICKET_LOCK_INC };
> +
>  	CLEAR_IO_SYNC;
> -	while (1) {
> -		if (likely(__arch_spin_trylock(lock) == 0))
> -			break;
> +	__asm__ __volatile__(
> +"1:	lwarx   %0,0,%4         # arch_spin_lock\n"
> +"	add     %1,%3,%0\n"
> +	PPC405_ERR77(0, "%4")
> +"	stwcx.  %1,0,%4\n"
> +"	bne-    1b"
> +	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
> +	: "r" (inc), "r" (&lock->tickets)
> +	: "cc");
> +
> +	if (likely(old.head == old.tail)) {
> +#if defined(CONFIG_PPC64)
> +		lock->holder = LOCK_TOKEN;
> +#endif
> +		goto out;
> +	}
> +
> +	for (;;) {
> +		unsigned count = 100;
> +
>  		do {
> +			if (ACCESS_ONCE(lock->tickets.head) == old.tail) {
> +#if defined(CONFIG_PPC64)
> +				lock->holder = LOCK_TOKEN;
> +#endif
> +				goto out;
> +			}
>  			HMT_low();
>  			if (SHARED_PROCESSOR)
>  				__spin_yield(lock);
> -		} while (unlikely(lock->slock != 0));
> +		} while (--count);
>  		HMT_medium();
>  	}
> +out:	barrier();	/* make sure nothing creeps before the lock is taken */
>  }
>  
>  static inline
> @@ -131,7 +211,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  
>  	CLEAR_IO_SYNC;
>  	while (1) {
> -		if (likely(__arch_spin_trylock(lock) == 0))
> +		if (likely(__arch_spin_trylock(lock)))
>  			break;
>  		local_save_flags(flags_dis);
>  		local_irq_restore(flags);
> @@ -139,7 +219,7 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  			HMT_low();
>  			if (SHARED_PROCESSOR)
>  				__spin_yield(lock);
> -		} while (unlikely(lock->slock != 0));
> +		} while (arch_spin_is_locked(lock));
>  		HMT_medium();
>  		local_irq_restore(flags_dis);
>  	}
> @@ -147,10 +227,22 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  
>  static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> +	arch_spinlock_t old, new;
> +
> +#if defined(CONFIG_PPC64)
> +	new.holder = 0;
> +#endif
> +	do {
> +		old.tickets = ACCESS_ONCE(lock->tickets);
> +#if defined(CONFIG_PPC64)
> +		old.holder = lock->holder;
> +#endif
> +		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
> +		new.tickets.tail = old.tickets.tail;
> +	} while (unlikely(__arch_spin_cmpxchg_eq(lock, old, new)));
>  	SYNC_IO;
>  	__asm__ __volatile__("# arch_spin_unlock\n\t"
>  				PPC_RELEASE_BARRIER: : :"memory");
> -	lock->slock = 0;
>  }
>  
>  #ifdef CONFIG_PPC64
> diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
> index 0c9c8d7..4a57e32 100644
> --- a/arch/powerpc/lib/locks.c
> +++ b/arch/powerpc/lib/locks.c
> @@ -27,7 +27,7 @@ void __spin_yield(arch_spinlock_t *lock)
>  {
>  	unsigned int lock_value, holder_cpu, yield_count;
>  
> -	lock_value = lock->slock;
> +	lock_value = lock->holder;
>  	if (lock_value == 0)
>  		return;
>  	holder_cpu = lock_value & 0xffff;
> @@ -36,7 +36,7 @@ void __spin_yield(arch_spinlock_t *lock)
>  	if ((yield_count & 1) == 0)
>  		return;		/* virtual cpu is currently running */
>  	rmb();
> -	if (lock->slock != lock_value)
> +	if (lock->holder != lock_value)
>  		return;		/* something has changed */
>  	plpar_hcall_norets(H_CONFER,
>  		get_hard_smp_processor_id(holder_cpu), yield_count);
> @@ -70,7 +70,7 @@ void __rw_yield(arch_rwlock_t *rw)
>  
>  void arch_spin_unlock_wait(arch_spinlock_t *lock)
>  {
> -	while (lock->slock) {
> +	while (arch_spin_is_locked(lock)) {
>  		HMT_low();
>  		if (SHARED_PROCESSOR)
>  			__spin_yield(lock);
> --
Peter Zijlstra Feb. 6, 2014, 4:38 p.m. UTC | #2
On Thu, Feb 06, 2014 at 11:37:37AM +0100, Torsten Duwe wrote:
> x86 has them, MIPS has them, ARM has them, even ia64 has them:
> ticket locks. They reduce memory bus and cache pressure especially
> for contended spinlocks, increasing performance.
> 
> This patch is a port of the x86 spin locks, mostly written in C,
> to the powerpc, introducing inline asm where needed. The pSeries
> directed yield for vCPUs is taken care of by an additional "holder"
> field in the lock.
> 

A few questions; what's with the ppc64 holder thing? Not having a 32bit
spinlock_t is sad.

Can you pair lwarx with sthcx ? I couldn't immediately find the answer
in the PowerISA doc. If so I think you can do better by being able to
atomically load both tickets but only storing the head without affecting
the tail.

In that case you can avoid the ll/sc on unlock, because only the lock
owner can modify the tail, so you can use a single half-word store.
Torsten Duwe Feb. 6, 2014, 5:37 p.m. UTC | #3
On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 06, 2014 at 11:37:37AM +0100, Torsten Duwe wrote:
> > x86 has them, MIPS has them, ARM has them, even ia64 has them:
> > ticket locks. They reduce memory bus and cache pressure especially
> > for contended spinlocks, increasing performance.
> > 
> > This patch is a port of the x86 spin locks, mostly written in C,
> > to the powerpc, introducing inline asm where needed. The pSeries
> > directed yield for vCPUs is taken care of by an additional "holder"
> > field in the lock.
> > 
> 
> A few questions; what's with the ppc64 holder thing? Not having a 32bit
> spinlock_t is sad.

I must admit that I haven't tested the patch on non-pseries ppc64 nor on
ppc32. Only ppc64 has the ldarx and I tried to atomically replace the 
holder along with the locks. That might prove unnecessary.

> Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> in the PowerISA doc. If so I think you can do better by being able to
> atomically load both tickets but only storing the head without affecting
> the tail.

V2.06b, Book II, Chapter 3, "sthcx" says:
| If a reservation exists and the length associated [...] is not 2 bytes,
| it is undefined whether (RS)_48:63 are stored [...]

That doesn't make me feel comfortable :(

	Torsten
Peter Zijlstra Feb. 6, 2014, 6:08 p.m. UTC | #4
On Thu, Feb 06, 2014 at 06:37:27PM +0100, Torsten Duwe wrote:
> On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> > On Thu, Feb 06, 2014 at 11:37:37AM +0100, Torsten Duwe wrote:
> > > x86 has them, MIPS has them, ARM has them, even ia64 has them:
> > > ticket locks. They reduce memory bus and cache pressure especially
> > > for contended spinlocks, increasing performance.
> > > 
> > > This patch is a port of the x86 spin locks, mostly written in C,
> > > to the powerpc, introducing inline asm where needed. The pSeries
> > > directed yield for vCPUs is taken care of by an additional "holder"
> > > field in the lock.
> > > 
> > 
> > A few questions; what's with the ppc64 holder thing? Not having a 32bit
> > spinlock_t is sad.
> 
> I must admit that I haven't tested the patch on non-pseries ppc64 nor on
> ppc32. Only ppc64 has the ldarx and I tried to atomically replace the 
> > holder along with the locks. That might prove unnecessary.

But what is the holder for? Can't we do away with that field?

> > Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> > in the PowerISA doc. If so I think you can do better by being able to
> > atomically load both tickets but only storing the head without affecting
> > the tail.
> 
> V2.06b, Book II, Chapter 3, "sthcx" says:
> | If a reservation exists and the length associated [...] is not 2 bytes,
> | it is undefined whether (RS)_48:63 are stored [...]
> 
> That doesn't make me feel comfortable :(

That's on page 692, right? The way I read that is: if the lharx/sthcx
don't have the exact same address, storage is undefined. But I can't
find mention of non-matching load and store size, although I can imagine
it being the same undefined.
Tom Musta Feb. 6, 2014, 7:28 p.m. UTC | #5
On 2/6/2014 12:08 PM, Peter Zijlstra wrote:
>>> Can you pair lwarx with sthcx ? I couldn't immediately find the answer
>>> > > in the PowerISA doc. If so I think you can do better by being able to
>>> > > atomically load both tickets but only storing the head without affecting
>>> > > the tail.
>> > 
>> > V2.06b, Book II, Chapter 3, "sthcx" says:
>> > | If a reservation exists and the length associated [...] is not 2 bytes,
>> > | it is undefined whether (RS)_48:63 are stored [...]
>> > 
>> > That doesn't make me feel comfortable :(
> That's on page 692, right? The way I read that is: if the lharx/sthcx
> don't have the exact same address, storage is undefined. But I can't
> find mention of non-matching load and store size, although I can imagine
> it being the same undefined.

My read is consistent with Torsten's ... this looks like a bad idea.

Look at the RTL for sthcx. on page 692 (Power ISA V2.06) and you will see this:

if RESERVE then
  if RESERVE_LENGTH = 2 then
     ...
  else
     undefined_case <- 1
else
  ...

A legal implementation might never perform the store.
Scott Wood Feb. 6, 2014, 8:19 p.m. UTC | #6
On Thu, 2014-02-06 at 18:37 +0100, Torsten Duwe wrote:
> On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> > On Thu, Feb 06, 2014 at 11:37:37AM +0100, Torsten Duwe wrote:
> > > x86 has them, MIPS has them, ARM has them, even ia64 has them:
> > > ticket locks. They reduce memory bus and cache pressure especially
> > > for contended spinlocks, increasing performance.
> > > 
> > > This patch is a port of the x86 spin locks, mostly written in C,
> > > to the powerpc, introducing inline asm where needed. The pSeries
> > > directed yield for vCPUs is taken care of by an additional "holder"
> > > field in the lock.
> > > 
> > 
> > A few questions; what's with the ppc64 holder thing? Not having a 32bit
> > spinlock_t is sad.
> 
> I must admit that I haven't tested the patch on non-pseries ppc64 nor on
> ppc32. Only ppc64 has the ldarx and I tried to atomically replace the 
> holder along with the locks. That might prove unnecessary.

Why is the functionality of holder only required on 64-bit?  We have too
many 32/64 differences as is.  Perhaps on 32-bit a lower max number of
CPUs could be assumed, to make it fit in one word.

> > Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> > in the PowerISA doc. If so I think you can do better by being able to
> > atomically load both tickets but only storing the head without affecting
> > the tail.
> 
> V2.06b, Book II, Chapter 3, "sthcx" says:
> | If a reservation exists and the length associated [...] is not 2 bytes,
> | it is undefined whether (RS)_48:63 are stored [...]
> 
> That doesn't make me feel comfortable :(

Plus, sthcx doesn't exist on all PPC chips.

-Scott
Torsten Duwe Feb. 7, 2014, 8:24 a.m. UTC | #7
On Thu, Feb 06, 2014 at 07:08:26PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 06, 2014 at 06:37:27PM +0100, Torsten Duwe wrote:
> > I must admit that I haven't tested the patch on non-pseries ppc64 nor on
> > ppc32. Only ppc64 has the ldarx and I tried to atomically replace the 
> > holder along with the locks. That might prove unnecessary.
> 
> But what is the holder for? Can't we do away with that field?

Scott, Peter: good questions.
The conditional is wrong because I conflated pSeries, ppc64 CPUs, and
64-bit kernels. I was misled by the LOCK_TOKEN definition above. Is that
correctly ifdef'd, with PPC64? The holder field should be ifdef'd
CONFIG_PPC_SPLPAR, independent of ppc64.

It is an advisory performance hint, and doesn't need to be updated atomically
with the lock; this and the above are 2 reasons to drop the asm string
operand size voodoo as well.
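
Roughly, the lock type would then look like this (sketch only, reusing
the field names from the patch above; v2 may differ):

typedef struct arch_spinlock {
	union {
		__ticketpair_t head_tail;
		struct __raw_tickets {
#ifdef __BIG_ENDIAN__		/* The "tail" part should be in the MSBs */
			__ticket_t tail, head;
#else
			__ticket_t head, tail;
#endif
		} tickets;
	};
#if defined(CONFIG_PPC_SPLPAR)
	u32 holder;	/* advisory hint for __spin_yield(), not updated atomically */
#endif
} arch_spinlock_t;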

Thanks,
	Torsten
Torsten Duwe Feb. 7, 2014, 9:02 a.m. UTC | #8
On Thu, Feb 06, 2014 at 02:19:52PM -0600, Scott Wood wrote:
> On Thu, 2014-02-06 at 18:37 +0100, Torsten Duwe wrote:
> > On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> 
> > > Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> > > in the PowerISA doc. If so I think you can do better by being able to
> > > atomically load both tickets but only storing the head without affecting
> > > the tail.

Can I simply write the half word, without a reservation, or will the HW caches
mess up the other half? Will it ruin the cache coherency on some (sub)architectures?

> Plus, sthcx doesn't exist on all PPC chips.

Which ones are lacking it? Do all have at least a simple 16-bit store?

	Torsten
Peter Zijlstra Feb. 7, 2014, 10:31 a.m. UTC | #9
On Fri, Feb 07, 2014 at 10:02:48AM +0100, Torsten Duwe wrote:
> On Thu, Feb 06, 2014 at 02:19:52PM -0600, Scott Wood wrote:
> > On Thu, 2014-02-06 at 18:37 +0100, Torsten Duwe wrote:
> > > On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> > 
> > > > Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> > > > in the PowerISA doc. If so I think you can do better by being able to
> > > > atomically load both tickets but only storing the head without affecting
> > > > the tail.
> 
> Can I simply write the half word, without a reservation, or will the HW caches
> mess up the other half? Will it ruin the cache coherency on some (sub)architectures?

So if you have ll/sc on the whole word concurrent with the half-word
store, you can lose the half-word store like:

  lwarx &tickets
  ...			sth &tail
  stwcd &tickets


The stwcd will over-write the tail store.

Anyway, what might work is something like (please forgive my ppc asm, I
can barely read the thing, I've never before attempted writing it):

lock:
1:	lharx	%0, 0, &head
	mov	%1, %0
	addic	%0, %0, 1
	stwcd   %0, 0, &head
	bne-	1b

2:	lhax	%0, 0, &tail
	lwsync
	cmp	0, %0, %0
	bne-	2b


unlock:
	lhz	%0, 0, &tail
	addic	%0, %0, 1
	lwsync
	sth	%0, 0, &tail


Which would somewhat translate into C as:

static inline void ticket_spin_lock(tickets_t *lock)
{
	ticket_t mine = xadd(&lock->head);

	while (smp_load_acquire(&lock->tail) != mine)
		cpu_relax();
}

static inline void ticket_spin_unlock(tickets_t *lock)
{
	ticket_t tail = lock->tail + 1;

	smp_store_release(&lock->tail, tail);
}

Where xadd() returns the value before addition and we assume half word
single-copy atomicy, such that the head and tail updates will not
interfere.

The x86 implementation uses the 32bit xadd and places the head at the
MSB end to get the atomic add + tail load in a single instruction, but
for PPC its much better to have an extra load (to an already hot
cacheline) and avoid a second ll/sc pair, as the ll/sc things are stupid
slow for your arch afaik.
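
The xadd() above is a fetch-and-add returning the pre-increment value;
a stand-in matching its use in this sketch (assuming gcc's __atomic
builtins) would be:

static inline unsigned short xadd(unsigned short *p)
{
	/* returns the value *before* the increment */
	return __atomic_fetch_add(p, 1, __ATOMIC_RELAXED);
}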
Peter Zijlstra Feb. 7, 2014, 10:36 a.m. UTC | #10
> So if you have ll/sc on the whole word concurrent with the half-word
> store, you can lose the half-word store like:
> 
>   lwarx &tickets
>   ...			sth &tail
>   stwcd &tickets
> 
> 
> The stwcd will over-write the tail store.

Oh wait, that's stupid, it will invalidate the lock and fail the store
and make it try again, so you could try and combine the load, but you'd
need an extra shift instruction instead of an extra load.

Not sure that's a valid trade-off..
Peter Zijlstra Feb. 7, 2014, 10:45 a.m. UTC | #11
On Fri, Feb 07, 2014 at 11:31:39AM +0100, Peter Zijlstra wrote:
> Anyway, what might work is something like (please forgive my ppc asm, I
> can barely read the thing, I've never before attempted writing it):
> 
> lock:
> 1:	lharx	%0, 0, &head
> 	mov	%1, %0
> 	addic	%0, %0, 1
> 	stwcd   %0, 0, &head
> 	bne-	1b
> 2:	lhax	%0, 0, &tail

That might need to be lhz too, I'm confused on all the load variants.

> 	lwsync
> 	cmp	0, %0, %0

	cmp	0, %0, %1

So we compare the &tail load to the xadd return %1 above.

> 	bne-	2b
> 
> 
> unlock:
> 	lhz	%0, 0, &tail
> 	addic	%0, %0, 1
> 	lwsync
> 	sth	%0, 0, &tail
>
Torsten Duwe Feb. 7, 2014, 11:49 a.m. UTC | #12
On Fri, Feb 07, 2014 at 11:45:30AM +0100, Peter Zijlstra wrote:
> 
> That might need to be lhz too, I'm confused on all the load variants.

;-)

> > unlock:
> > 	lhz	%0, 0, &tail
> > 	addic	%0, %0, 1

No carry with this one, I'd say.
Besides, unlock increments the head.

> > 	lwsync
> > 	sth	%0, 0, &tail
> > 

Given the beauty and simplicity of this, may I ask Ingo:
you signed off 314cdbefd1fd0a7acf3780e9628465b77ea6a836;
can you explain why head and tail must live on the same cache
line? Or is it just a space saver? I just ported it to ppc,
I didn't think about alternatives.

What about

atomic_t tail;
volatile int head; ?

Admittedly, that's usually 8 bytes instead of 4...

	Torsten
Peter Zijlstra Feb. 7, 2014, 12:28 p.m. UTC | #13
On Fri, Feb 07, 2014 at 12:49:49PM +0100, Torsten Duwe wrote:
> On Fri, Feb 07, 2014 at 11:45:30AM +0100, Peter Zijlstra wrote:
> > 
> > That might need to be lhz too, I'm confused on all the load variants.
> 
> ;-)
> 
> > > unlock:
> > > 	lhz	%0, 0, &tail
> > > 	addic	%0, %0, 1
> 
> No carry with this one, I'd say.

Right you are, add immediate it is.

> Besides, unlock increments the head.

No, unlock increments the tail, lock increments the head and waits until
the tail matches the pre-inc value.

That said, why do the atomic_inc() primitives do a carry add? (that's
where I borrowed it from).

> > > 	lwsync
> > > 	sth	%0, 0, &tail
> > > 
> 
> Given the beauty and simplicity of this, may I ask Ingo:
> you signed off 314cdbefd1fd0a7acf3780e9628465b77ea6a836;
> can you explain why head and tail must live on the same cache
> line? Or is it just a space saver? I just ported it to ppc,
> I didn't think about alternatives.

spinlock_t should, ideally, be 32bits.

> What about
> 
> atomic_t tail;
> volatile int head; ?
> 
> Admittedly, that's usually 8 bytes instead of 4...

That still won't straddle a cacheline unless you do weird alignment
things which will bloat all the various data structures more still.

Anyway, you can do a version with lwarx/stwcx if you're looking to get rid
of lharx.
Peter Zijlstra Feb. 7, 2014, 3:18 p.m. UTC | #14
On Fri, Feb 07, 2014 at 01:28:37PM +0100, Peter Zijlstra wrote:
> Anyway, you can do a version with lwarx/stwcx if you're looking to get rid
> of lharx.

the below seems to compile into relatively ok asm. It can be done better
if you write the entire thing by hand though.

---

typedef unsigned short ticket_t;

typedef struct {
	union {
		unsigned int pair;
		struct {
			/* ensure @head is the MSB */
#ifdef __BIG_ENDIAN__
			ticket_t head,tail;
#else
			ticket_t tail,head;
#endif
		};
	};
} tickets_t;

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

#define barrier()	__asm__ __volatile__ ("" : : :"memory")
#define __lwsync()	__asm__ __volatile__ ("lwsync" : : :"memory")

#define smp_store_release(p, v)                                         \
do {                                                                    \
        __lwsync();                                                     \
        ACCESS_ONCE(*p) = (v);                                          \
} while (0)

#define smp_load_acquire(p)                                             \
({                                                                      \
        typeof(*p) ___p1 = ACCESS_ONCE(*p);                             \
        __lwsync();                                                     \
        ___p1;                                                          \
})

#define likely(x)	__builtin_expect(!!(x), 1)

#define cpu_relax()	barrier();

static inline unsigned int xadd(unsigned int *v, unsigned int i)
{
	int t, ret;
	
	__asm__ __volatile__ (
"1:	lwarx	%0, 0, %4\n"
"	mr	%1, %0\n"
"	add	%0, %3, %0\n"
"	stwcx.	%0, %0, %4\n"
"	bne-	1b\n"
	: "=&r" (t), "=&r" (ret), "+m" (*v)
	: "r" (i), "r" (v)
	: "cc");

	return ret;
}

void ticket_lock(tickets_t *lock)
{
	tickets_t t;

	/*
	 * Because @head is MSB, the direct increment wrap doesn't disturb
	 * @tail.
	 */
	t.pair = xadd(&lock->pair, 1<<16);

	if (likely(t.head == t.tail)) {
		__lwsync(); /* acquire */
		return;
	}

	while (smp_load_acquire(&lock->tail) != t.tail)
		cpu_relax();
}

void ticket_unlock(tickets_t *lock)
{
	ticket_t tail = lock->tail + 1;

	/*
	 * The store is safe against the xadd for it will make the ll/sc fail
	 * and try again. Aside from that, PowerISA guarantees single-copy
	 * atomicity for half-word writes.
	 *
	 * And since only the lock owner will ever write the tail, we're good.
	 */
	smp_store_release(&lock->tail, tail);
}
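
A minimal single-threaded smoke test, appended to the listing above
(hypothetical, only to show the intended call pattern):

int main(void)
{
	tickets_t lock = { .pair = 0 };

	ticket_lock(&lock);	/* uncontended: head becomes 1, tail stays 0 */
	/* critical section */
	ticket_unlock(&lock);	/* tail becomes 1, head == tail again: free */
	return 0;
}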
Peter Zijlstra Feb. 7, 2014, 3:43 p.m. UTC | #15
On Fri, Feb 07, 2014 at 04:18:47PM +0100, Peter Zijlstra wrote:
> void ticket_lock(tickets_t *lock)
> {
> 	tickets_t t;
> 
> 	/*
> 	 * Because @head is MSB, the direct increment wrap doesn't disturb
> 	 * @tail.
> 	 */
> 	t.pair = xadd(&lock->pair, 1<<16);
> 
> 	if (likely(t.head == t.tail)) {
> 		__lwsync(); /* acquire */
> 		return;
> 	}
> 
> 	while (smp_load_acquire(&lock->tail) != t.tail)
> 		cpu_relax();

That should be "!= t.head", for that contains our ticket.

I'm hopelessly scatter brained today it seems :/


> }
> 
> void ticket_unlock(tickets_t *lock)
> {
> 	ticket_t tail = lock->tail + 1;
> 
> 	/*
> 	 * The store is save against the xadd for it will make the ll/sc fail
> 	 * and try again. Aside from that PowerISA guarantees single-copy
> 	 * atomicy for half-word writes.
> 	 *
> 	 * And since only the lock owner will ever write the tail, we're good.
> 	 */
> 	smp_store_release(&lock->tail, tail);
> }
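
Folding that fix back into the earlier listing, the lock side then
reads:

void ticket_lock(tickets_t *lock)
{
	tickets_t t;

	/* @head is the MSB half, so the increment cannot disturb @tail */
	t.pair = xadd(&lock->pair, 1<<16);

	if (likely(t.head == t.tail)) {
		__lwsync(); /* acquire */
		return;
	}

	while (smp_load_acquire(&lock->tail) != t.head)
		cpu_relax();
}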
Kumar Gala Feb. 7, 2014, 3:51 p.m. UTC | #16
On Feb 7, 2014, at 3:02 AM, Torsten Duwe <duwe@lst.de> wrote:

> On Thu, Feb 06, 2014 at 02:19:52PM -0600, Scott Wood wrote:
>> On Thu, 2014-02-06 at 18:37 +0100, Torsten Duwe wrote:
>>> On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
>> 
>>>> Can you pair lwarx with sthcx ? I couldn't immediately find the answer
>>>> in the PowerISA doc. If so I think you can do better by being able to
>>>> atomically load both tickets but only storing the head without affecting
>>>> the tail.
> 
> Can I simply write the half word, without a reservation, or will the HW caches
> mess up the other half? Will it ruin the cache coherency on some (sub)architectures?

The coherency should be fine, I just can’t remember if you’ll lose the reservation by doing this.

>> Plus, sthcx doesn't exist on all PPC chips.
> 
> Which ones are lacking it? Do all have at least a simple 16-bit store?

Everything implements a simple 16-bit store, just not everything implements the store conditional of 16-bit data.

- k
Peter Zijlstra Feb. 7, 2014, 4:10 p.m. UTC | #17
On Fri, Feb 07, 2014 at 09:51:16AM -0600, Kumar Gala wrote:
> 
> On Feb 7, 2014, at 3:02 AM, Torsten Duwe <duwe@lst.de> wrote:
> 
> > On Thu, Feb 06, 2014 at 02:19:52PM -0600, Scott Wood wrote:
> >> On Thu, 2014-02-06 at 18:37 +0100, Torsten Duwe wrote:
> >>> On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> >> 
> >>>> Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> >>>> in the PowerISA doc. If so I think you can do better by being able to
> >>>> atomically load both tickets but only storing the head without affecting
> >>>> the tail.
> > 
> > Can I simply write the half word, without a reservation, or will the HW caches
> > mess up the other half? Will it ruin the cache coherency on some (sub)architectures?
> 
> The coherency should be fine, I just can’t remember if you’ll lose the reservation by doing this.

It should, I suppose, seeing how you 'destroy' the state it got from the
load.

> >> Plus, sthcx doesn't exist on all PPC chips.
> > 
> > Which ones are lacking it? Do all have at least a simple 16-bit store?
> 
> Everything implements a simple 16-bit store, just not everything implements the store conditional of 16-bit data.

Ok, so then the last version I posted should work on those machines.
Torsten Duwe Feb. 7, 2014, 5:08 p.m. UTC | #18
On Fri, Feb 07, 2014 at 04:18:47PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2014 at 01:28:37PM +0100, Peter Zijlstra wrote:
> > Anyway, you can do a version with lwarx/stwcx if you're looking to get rid
> > of lharx.
> 
> the below seems to compile into relatively ok asm. It can be done better
> if you write the entire thing by hand though.

[...]

> 
> static inline unsigned int xadd(unsigned int *v, unsigned int i)
> {
> 	int t, ret;
> 	
> 	__asm__ __volatile__ (
> "1:	lwarx	%0, 0, %4\n"
> "	mr	%1, %0\n"
> "	add	%0, %3, %0\n"
> "	stwcx.	%0, %0, %4\n"
> "	bne-	1b\n"
> 	: "=&r" (t), "=&r" (ret), "+m" (*v)
> 	: "r" (i), "r" (v)
> 	: "cc");
> 
> 	return ret;
> }
> 
I don't like this xadd thing -- it's so x86 ;)
x86 has its LOCK prefix, ppc has ll/sc.
That should be reflected somehow IMHO.

Maybe if xadd became mandatory for some kernel library.

> 
> void ticket_unlock(tickets_t *lock)
> {
> 	ticket_t tail = lock->tail + 1;
> 
> 	/*
> 	 * The store is save against the xadd for it will make the ll/sc fail
> 	 * and try again. Aside from that PowerISA guarantees single-copy
> 	 * atomicy for half-word writes.
> 	 *
> 	 * And since only the lock owner will ever write the tail, we're good.
> 	 */
> 	smp_store_release(&lock->tail, tail);
> }

Yeah, let's try that on top of v2 (just posted).
First, I want to see v2 work as nicely as v1 --
compiling a debug kernel takes a while...

	Torsten
Peter Zijlstra Feb. 7, 2014, 5:19 p.m. UTC | #19
On Fri, Feb 07, 2014 at 06:08:45PM +0100, Torsten Duwe wrote:
> > static inline unsigned int xadd(unsigned int *v, unsigned int i)
> > {
> > 	int t, ret;
> > 	
> > 	__asm__ __volatile__ (
> > "1:	lwarx	%0, 0, %4\n"
> > "	mr	%1, %0\n"
> > "	add	%0, %3, %0\n"
> > "	stwcx.	%0, %0, %4\n"
> > "	bne-	1b\n"
> > 	: "=&r" (t), "=&r" (ret), "+m" (*v)
> > 	: "r" (i), "r" (v)
> > 	: "cc");
> > 
> > 	return ret;
> > }
> > 
> I don't like this xadd thing -- it's so x86 ;)
> x86 has its LOCK prefix, ppc has ll/sc.
> That should be reflected somehow IMHO.

It's the operational semantics I care about; this version is actually
nicer in that it doesn't imply all sorts of barriers :-)

> Maybe if xadd became mandatory for some kernel library.

call it fetch_add(); it's not an uncommon operation and many people
understand the semantics.

But you can simply include the asm bits in ticket_lock() and be done
with it. In that case you can also replace the add with an addi which
might be a little more efficient.
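
A sketch of that (illustrative only; head_fetch_inc is a made-up name,
the 1<<16 increment does not fit addi's 16-bit immediate so addis is
used instead, and the "b" constraint keeps r0 out of the register-form
addis):

static inline unsigned int head_fetch_inc(unsigned int *pair)
{
	unsigned int t, ret;

	__asm__ __volatile__ (
"1:	lwarx	%0,0,%3\n"	/* load-reserve the whole head|tail word */
"	mr	%1,%0\n"	/* keep the pre-increment value */
"	addis	%0,%0,1\n"	/* add 1 << 16, i.e. bump the head half */
"	stwcx.	%0,0,%3\n"
"	bne-	1b\n"
	: "=&b" (t), "=&r" (ret), "+m" (*pair)
	: "r" (pair)
	: "cc");

	return ret;
}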

> > void ticket_unlock(tickets_t *lock)
> > {
> > 	ticket_t tail = lock->tail + 1;
> > 
> > 	/*
> > 	 * The store is save against the xadd for it will make the ll/sc fail
> > 	 * and try again. Aside from that PowerISA guarantees single-copy
> > 	 * atomicy for half-word writes.
> > 	 *
> > 	 * And since only the lock owner will ever write the tail, we're good.
> > 	 */
> > 	smp_store_release(&lock->tail, tail);
> > }
> 
> Yeah, let's try that on top of v2 (just posted).
> First, I want to see v2 work as nicely as v1 --
> compiling a debug kernel takes a while...

Use a faster machine... it can be done < 1 minute :-)
Benjamin Herrenschmidt Feb. 10, 2014, 2:54 a.m. UTC | #20
On Thu, 2014-02-06 at 13:28 -0600, Tom Musta wrote:
> My read is consistent with Torsten's ... this looks like a bad idea.
> 
> Look at the RTL for sthcx. on page 692 (Power ISA V2.06) and you will
> see this:
> 
> if RESERVE then
>   if RESERVE_LENGTH = 2 then
>      ...
>   else
>      undefined_case <- 1
> else
>   ...
> 
> A legal implementation might never perform the store.

This is an area where we definitely want to check with the implementors
and if the implementations happen to do what we want (they likely do),
get the architecture changed for future chips and use it anyway.

There's a *significant* benefit in avoiding an atomic operation in the
unlock case.

The reservation mechanism being based on a granule that is generally a
cache line, I doubt implementations will ever check the actual access
size, but we need to double check.

Cheers,
Ben.
Benjamin Herrenschmidt Feb. 10, 2014, 3:02 a.m. UTC | #21
On Fri, 2014-02-07 at 10:02 +0100, Torsten Duwe wrote:
> > > > Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> > > > in the PowerISA doc. If so I think you can do better by being able to
> > > > atomically load both tickets but only storing the head without affecting
> > > > the tail.
> 
> Can I simply write the half word, without a reservation, or will the HW caches
> mess up the other half? Will it ruin the cache coherency on some (sub)architectures?

Yes, you can, I *think*

> > Plus, sthcx doesn't exist on all PPC chips.
> 
> Which ones are lacking it? Do all have at least a simple 16-bit store?

Half-word atomics (and byte atomics) are new; they were added in architecture
2.06, I believe, so it's fairly recent. It's still worthwhile to investigate a
way to avoid atomics on unlock on recent processors (we can use instruction patching
based on CPU features if necessary), because there's definitely a significant cost
in doing a larx/stcx. sequence on powerpc, way higher than our current unlock path
of barrier + store.

Cheers,
Ben.
Benjamin Herrenschmidt Feb. 10, 2014, 3:05 a.m. UTC | #22
On Fri, 2014-02-07 at 09:51 -0600, Kumar Gala wrote:
> On Feb 7, 2014, at 3:02 AM, Torsten Duwe <duwe@lst.de> wrote:
> 
> > On Thu, Feb 06, 2014 at 02:19:52PM -0600, Scott Wood wrote:
> >> On Thu, 2014-02-06 at 18:37 +0100, Torsten Duwe wrote:
> >>> On Thu, Feb 06, 2014 at 05:38:37PM +0100, Peter Zijlstra wrote:
> >> 
> >>>> Can you pair lwarx with sthcx ? I couldn't immediately find the answer
> >>>> in the PowerISA doc. If so I think you can do better by being able to
> >>>> atomically load both tickets but only storing the head without affecting
> >>>> the tail.
> > 
> > Can I simply write the half word, without a reservation, or will the HW caches
> > mess up the other half? Will it ruin the cache coherency on some (sub)architectures?
> 
> The coherency should be fine, I just can’t remember if you’ll lose the reservation by doing this.

Yes you do.

> >> Plus, sthcx doesn't exist on all PPC chips.
> > 
> > Which ones are lacking it? Do all have at least a simple 16-bit store?
> 
> Everything implements a simple 16-bit store, just not everything implements the store conditional of 16-bit data.

Ben.

> - k

Patch

diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 2351adc..64a98be 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -5,11 +5,30 @@ 
 # error "please don't include this file directly"
 #endif
 
-typedef struct {
-	volatile unsigned int slock;
-} arch_spinlock_t;
+typedef u16 __ticket_t;
+typedef u32 __ticketpair_t;
+
+#define TICKET_LOCK_INC	((__ticket_t)1)
+
+#define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
+
+typedef struct arch_spinlock {
+	union {
+		__ticketpair_t head_tail;
+		struct __raw_tickets {
+#ifdef __BIG_ENDIAN__		/* The "tail" part should be in the MSBs */
+			__ticket_t tail, head;
+#else
+			__ticket_t head, tail;
+#endif
+		} tickets;
+	};
+#if defined(CONFIG_PPC64)
+	u32 holder;
+#endif
+} arch_spinlock_t __aligned(8);
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 }, 0 }
 
 typedef struct {
 	volatile signed int lock;
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 5f54a74..6e33691 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -9,8 +9,7 @@ 
  * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
  * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
  *	Rework to support virtual processors
- *
- * Type of int is used as a full 64b word is not necessary.
+ * Copyright (C) 2014 Torsten Duwe <duwe@suse.de>, ticket lock port
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -28,7 +27,20 @@ 
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-#define arch_spin_is_locked(x)		((x)->slock != 0)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+	return tmp.tail != tmp.head;
+}
+
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
+{
+	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+}
+#define arch_spin_is_contended	arch_spin_is_contended
 
 #ifdef CONFIG_PPC64
 /* use 0x800000yy when locked, where yy == CPU number */
@@ -37,8 +49,12 @@ 
 #else
 #define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
 #endif
+#define SIZE_LETTER	"d"
+#define L64		"1"
 #else
 #define LOCK_TOKEN	1
+#define SIZE_LETTER	"w"
+#define L64		"0"
 #endif
 
 #if defined(CONFIG_PPC64) && defined(CONFIG_SMP)
@@ -55,33 +71,59 @@ 
 #endif
 
 /*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
+ * Our own cmpxchg, operating on spinlock_t's.  Returns 0 iff value
+ * read at lock was equal to "old" AND the cmpxchg succeeded
+ * uninterruptedly.
  */
-static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
+static __always_inline int __arch_spin_cmpxchg_eq(arch_spinlock_t *lock,
+					       arch_spinlock_t old,
+					       arch_spinlock_t new)
 {
-	unsigned long tmp, token;
+	register int retval = 1;
+	register arch_spinlock_t tmp;
+
+	__asm__ __volatile__ (
+"	li %0,1\n"		/* default to "fail" */
+	PPC_RELEASE_BARRIER
+"1:	l" SIZE_LETTER "arx   %2,0,%5         # __arch_spin_cmpxchg_eq\n"
+"	cmp 0,"L64",%3,%2\n"
+"	bne-    2f\n"
+	PPC405_ERR77(0, "%5")
+"	st" SIZE_LETTER "cx.  %4,0,%5\n"
+"	bne-    1b\n"
+"	isync\n"
+"	li %0,0\n"
+"2:"
+	: "=&r" (retval), "+m" (*lock)
+	: "r" (tmp), "r" (old), "r" (new), "r" (lock)
+	: "cc", "memory");
+
+	return retval;
+}
+#undef SIZE_LETTER
+#undef L64
 
-	token = LOCK_TOKEN;
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0,0,%2,1) "\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n"
-	PPC_ACQUIRE_BARRIER
-"2:"
-	: "=&r" (tmp)
-	: "r" (token), "r" (&lock->slock)
-	: "cr0", "memory");
+static __always_inline int __arch_spin_trylock(arch_spinlock_t *lock)
+{
+	arch_spinlock_t old, new;
 
-	return tmp;
+	old.tickets = ACCESS_ONCE(lock->tickets);
+	if (old.tickets.head != old.tickets.tail)
+		return 0;
+
+	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+#if defined(CONFIG_PPC64)
+	old.holder = lock->holder;
+	new.holder = LOCK_TOKEN;
+#endif
+
+	return !__arch_spin_cmpxchg_eq(lock, old, new);
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
-	return __arch_spin_trylock(lock) == 0;
+	return  __arch_spin_trylock(lock);
 }
 
 /*
@@ -93,9 +135,8 @@  static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * rest of our timeslice to the lock holder.
  *
  * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held.  Conveniently, we have a word in the paca that holds this
- * value.
+ * we put 0x80000000 | smp_processor_id() into lock->holder.
+ * Conveniently, we have a word in the paca that holds this value.
  */
 
 #if defined(CONFIG_PPC_SPLPAR)
@@ -109,19 +150,59 @@  extern void __rw_yield(arch_rwlock_t *lock);
 #define SHARED_PROCESSOR	0
 #endif
 
-static inline void arch_spin_lock(arch_spinlock_t *lock)
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an asm covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide add increment of the low part would carry
+ * up and contaminate the high part.
+ */
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 {
+	register struct __raw_tickets old, tmp,
+		inc = { .tail = TICKET_LOCK_INC };
+
 	CLEAR_IO_SYNC;
-	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
-			break;
+	__asm__ __volatile__(
+"1:	lwarx   %0,0,%4         # arch_spin_lock\n"
+"	add     %1,%3,%0\n"
+	PPC405_ERR77(0, "%4")
+"	stwcx.  %1,0,%4\n"
+"	bne-    1b"
+	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
+	: "r" (inc), "r" (&lock->tickets)
+	: "cc");
+
+	if (likely(old.head == old.tail)) {
+#if defined(CONFIG_PPC64)
+		lock->holder = LOCK_TOKEN;
+#endif
+		goto out;
+	}
+
+	for (;;) {
+		unsigned count = 100;
+
 		do {
+			if (ACCESS_ONCE(lock->tickets.head) == old.tail) {
+#if defined(CONFIG_PPC64)
+				lock->holder = LOCK_TOKEN;
+#endif
+				goto out;
+			}
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
+		} while (--count);
 		HMT_medium();
 	}
+out:	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static inline
@@ -131,7 +211,7 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 
 	CLEAR_IO_SYNC;
 	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
+		if (likely(__arch_spin_trylock(lock)))
 			break;
 		local_save_flags(flags_dis);
 		local_irq_restore(flags);
@@ -139,7 +219,7 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
+		} while (arch_spin_is_locked(lock));
 		HMT_medium();
 		local_irq_restore(flags_dis);
 	}
@@ -147,10 +227,22 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
+	arch_spinlock_t old, new;
+
+#if defined(CONFIG_PPC64)
+	new.holder = 0;
+#endif
+	do {
+		old.tickets = ACCESS_ONCE(lock->tickets);
+#if defined(CONFIG_PPC64)
+		old.holder = lock->holder;
+#endif
+		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+	} while (unlikely(__arch_spin_cmpxchg_eq(lock, old, new)));
 	SYNC_IO;
 	__asm__ __volatile__("# arch_spin_unlock\n\t"
 				PPC_RELEASE_BARRIER: : :"memory");
-	lock->slock = 0;
 }
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 0c9c8d7..4a57e32 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -27,7 +27,7 @@  void __spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 
-	lock_value = lock->slock;
+	lock_value = lock->holder;
 	if (lock_value == 0)
 		return;
 	holder_cpu = lock_value & 0xffff;
@@ -36,7 +36,7 @@  void __spin_yield(arch_spinlock_t *lock)
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
-	if (lock->slock != lock_value)
+	if (lock->holder != lock_value)
 		return;		/* something has changed */
 	plpar_hcall_norets(H_CONFER,
 		get_hard_smp_processor_id(holder_cpu), yield_count);
@@ -70,7 +70,7 @@  void __rw_yield(arch_rwlock_t *rw)
 
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	while (lock->slock) {
+	while (arch_spin_is_locked(lock)) {
 		HMT_low();
 		if (SHARED_PROCESSOR)
 			__spin_yield(lock);