
[v2] powerpc ticket locks

Message ID 20140207165801.GC2107@lst.de (mailing list archive)
State RFC

Commit Message

Torsten Duwe Feb. 7, 2014, 4:58 p.m. UTC
Ticket locks for ppc, version 2. Changes since v1:
* The atomically exchanged entity is always 32 bits.
* Inline asm string variations thus removed.
* Carry the additional holder hint only #if defined(CONFIG_PPC_SPLPAR)

Signed-off-by: Torsten Duwe <duwe@suse.de>
--
 arch/powerpc/include/asm/spinlock_types.h |   29 +++++
 arch/powerpc/include/asm/spinlock.h       |  146 +++++++++++++++++++++++-------
 arch/powerpc/lib/locks.c                  |    6 -
 3 files changed, 144 insertions(+), 37 deletions(-)

Comments

Peter Zijlstra Feb. 7, 2014, 5:12 p.m. UTC | #1
On Fri, Feb 07, 2014 at 05:58:01PM +0100, Torsten Duwe wrote:
> +static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
>  {
> +	register struct __raw_tickets old, tmp,
> +		inc = { .tail = TICKET_LOCK_INC };
> +
>  	CLEAR_IO_SYNC;
> +	__asm__ __volatile__(
> +"1:	lwarx	%0,0,%4		# arch_spin_lock\n"
> +"	add	%1,%3,%0\n"
> +	PPC405_ERR77(0, "%4")
> +"	stwcx.	%1,0,%4\n"
> +"	bne-	1b"
> +	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
> +	: "r" (inc), "r" (&lock->tickets)
> +	: "cc");
> +
> +	if (likely(old.head == old.tail))
> +		goto out;

I would have expected an lwsync someplace hereabouts.

> +	for (;;) {
> +		unsigned count = 100;
> +
>  		do {
> +			if (ACCESS_ONCE(lock->tickets.head) == old.tail)
> +				goto out;
>  			HMT_low();
>  			if (SHARED_PROCESSOR)
>  				__spin_yield(lock);
> +		} while (--count);
>  		HMT_medium();
>  	}
> +out:
> +#if defined(CONFIG_PPC_SPLPAR)
> +	lock->holder = LOCK_TOKEN;
> +#endif
> +	barrier();	/* make sure nothing creeps before the lock is taken */
>  }
>  
>  static inline

> @@ -147,10 +220,21 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
>  
>  static inline void arch_spin_unlock(arch_spinlock_t *lock)
>  {
> +	arch_spinlock_t old, new;
> +
> +#if defined(CONFIG_PPC_SPLPAR)
> +	lock->holder = 0;
> +#endif
> +	do {
> +		old.tickets = ACCESS_ONCE(lock->tickets);
> +		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
> +		new.tickets.tail = old.tickets.tail;
> +	} while (unlikely(__arch_spin_cmpxchg_eq(lock,
> +						 old.head_tail,
> +						 new.head_tail)));
>  	SYNC_IO;
>  	__asm__ __volatile__("# arch_spin_unlock\n\t"
>  				PPC_RELEASE_BARRIER: : :"memory");

Doesn't your cmpxchg_eq already imply a lwsync?

> -	lock->slock = 0;
>  }

I'm still failing to see why you need an ll/sc pair for unlock.
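
(For reference, a minimal sketch of where explicit acquire ordering could go once the
ticket is owned, reusing the PPC_ACQUIRE_BARRIER macro that the old __arch_spin_trylock
emits; a sketch only, not part of the posted patch:)

out:
	/* keep the critical section from being reordered before the acquisition */
	__asm__ __volatile__(PPC_ACQUIRE_BARRIER : : : "memory");
#if defined(CONFIG_PPC_SPLPAR)
	lock->holder = LOCK_TOKEN;
#endif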
Torsten Duwe Feb. 7, 2014, 5:55 p.m. UTC | #2
On Fri, Feb 07, 2014 at 06:12:24PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 07, 2014 at 05:58:01PM +0100, Torsten Duwe wrote:
> > +static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
> >  {
> > +	register struct __raw_tickets old, tmp,
> > +		inc = { .tail = TICKET_LOCK_INC };
> > +
> >  	CLEAR_IO_SYNC;
> > +	__asm__ __volatile__(
> > +"1:	lwarx	%0,0,%4		# arch_spin_lock\n"
> > +"	add	%1,%3,%0\n"
> > +	PPC405_ERR77(0, "%4")
> > +"	stwcx.	%1,0,%4\n"
> > +"	bne-	1b"
> > +	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
> > +	: "r" (inc), "r" (&lock->tickets)
> > +	: "cc");
> > +
> > +	if (likely(old.head == old.tail))
> > +		goto out;
> 
> I would have expected an lwsync someplace hereabouts.

Let me reconsider this. The v1 code worked on an 8-core machine;
maybe I didn't beat it hard enough.

> >  static inline void arch_spin_unlock(arch_spinlock_t *lock)
> >  {
> > +	arch_spinlock_t old, new;
> > +
> > +#if defined(CONFIG_PPC_SPLPAR)
> > +	lock->holder = 0;
> > +#endif
> > +	do {
> > +		old.tickets = ACCESS_ONCE(lock->tickets);
> > +		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
> > +		new.tickets.tail = old.tickets.tail;
> > +	} while (unlikely(__arch_spin_cmpxchg_eq(lock,
> > +						 old.head_tail,
> > +						 new.head_tail)));
> >  	SYNC_IO;
> >  	__asm__ __volatile__("# arch_spin_unlock\n\t"
> >  				PPC_RELEASE_BARRIER: : :"memory");
> 
> Doesn't your cmpxchg_eq already imply a lwsync?

Right.

> > -	lock->slock = 0;
> >  }
> 
> I'm still failing to see why you need an ll/sc pair for unlock.

Like so:
static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
	arch_spinlock_t tmp;

#if defined(CONFIG_PPC_SPLPAR)
	lock->holder = 0;
#endif
	tmp.tickets = ACCESS_ONCE(lock->tickets);
	tmp.tickets.head += TICKET_LOCK_INC;
	lock->tickets.head = tmp.tickets.head;
	SYNC_IO;
	__asm__ __volatile__("# arch_spin_unlock\n\t"
				PPC_RELEASE_BARRIER: : :"memory");
}
?

I'll wrap it all up next week. I only wanted to post an updated v2
with the agreed-upon changes for BenH.

Thanks so far!

	Torsten
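
(A variant of the snippet above that keeps the release barrier ahead of the store which
publishes the new head, mirroring the old "PPC_RELEASE_BARRIER; lock->slock = 0"
ordering; a sketch only:)

static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
#if defined(CONFIG_PPC_SPLPAR)
	lock->holder = 0;
#endif
	SYNC_IO;
	__asm__ __volatile__("# arch_spin_unlock\n\t"
				PPC_RELEASE_BARRIER: : :"memory");
	/* only the lock holder ever stores head, so a plain store suffices */
	lock->tickets.head += TICKET_LOCK_INC;
}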
Benjamin Herrenschmidt Feb. 10, 2014, 3:10 a.m. UTC | #3
On Fri, 2014-02-07 at 17:58 +0100, Torsten Duwe wrote:
>  typedef struct {
> -       volatile unsigned int slock;
> -} arch_spinlock_t;
> +       union {
> +               __ticketpair_t head_tail;
> +               struct __raw_tickets {
> +#ifdef __BIG_ENDIAN__          /* The "tail" part should be in the MSBs */
> +                       __ticket_t tail, head;
> +#else
> +                       __ticket_t head, tail;
> +#endif
> +               } tickets;
> +       };
> +#if defined(CONFIG_PPC_SPLPAR)
> +       u32 holder;
> +#endif
> +} arch_spinlock_t __aligned(4);

That's still broken with lockref (which we just merged).

We must have the arch_spinlock_t and the ref in the same 64-bit word
otherwise it will break.

We can make it work in theory since the holder doesn't have to be
accessed atomically, but the practicals are a complete mess ...
lockref would essentially have to re-implement the holder handling
of the spinlocks and use lower level ticket stuff.

Unless you can find a sneaky trick ... :-(

Ben.
Torsten Duwe Feb. 10, 2014, 3:52 p.m. UTC | #4
On Mon, Feb 10, 2014 at 02:10:23PM +1100, Benjamin Herrenschmidt wrote:
> On Fri, 2014-02-07 at 17:58 +0100, Torsten Duwe wrote:
> >  typedef struct {
> > -       volatile unsigned int slock;
> > -} arch_spinlock_t;
> > +       union {
> > +               __ticketpair_t head_tail;
> > +               struct __raw_tickets {
> > +#ifdef __BIG_ENDIAN__          /* The "tail" part should be in the MSBs */
> > +                       __ticket_t tail, head;
> > +#else
> > +                       __ticket_t head, tail;
> > +#endif
> > +               } tickets;
> > +       };
> > +#if defined(CONFIG_PPC_SPLPAR)
> > +       u32 holder;
> > +#endif
> > +} arch_spinlock_t __aligned(4);
> 
> That's still broken with lockref (which we just merged).
> 
> We must have the arch_spinlock_t and the ref in the same 64-bit word
> otherwise it will break.

Well, as far as I can see, you'll simply not be able to
USE_CMPXCHG_LOCKREF -- with the corresponding performance hit --
and the code just falls back to lock & ref on pSeries.
What again was the intention of directed yield in the first place...?

> We can make it work in theory since the holder doesn't have to be
> accessed atomically, but the practicals are a complete mess ...
> lockref would essentially have to re-implement the holder handling
> of the spinlocks and use lower level ticket stuff.
> 
> Unless you can find a sneaky trick ... :-(

What if I squeeze the bits a little?
4k vCPUs, and 256 physical, as a limit to stay within 32 bits?
At the cost that unlock may become an ll/sc operation again.
I could think about a trick against that.
But alas, hw_cpu_id is 16 bits, which makes a lookup table necessary :-/

Doing another round of yields for lockrefs now doesn't
sound so bad any more.

Opinions, anyone?

	Torsten
Peter Zijlstra Feb. 10, 2014, 5:53 p.m. UTC | #5
On Mon, Feb 10, 2014 at 04:52:17PM +0100, Torsten Duwe wrote:
> Opinions, anyone?

Since the holder thing is a performance thing, not a correctness thing,
one thing you could do is something like:

#define OWNER_HASH_SIZE	(CONFIG_NR_CPUS * 4)
#define OWNER_HASH_BITS	ilog2(OWNER_HASH_SIZE)

u16 lock_owner_array[OWNER_HASH_SIZE] = { 0, };

void set_owner(arch_spinlock_t *lock, int owner)
{
	int hash = hash_ptr(lock, OWNER_HASH_BITS);
	lock_owner_array[hash] = owner;
}

void yield_to_owner(arch_spinlock_t *lock)
{
	int hash = hash_ptr(lock, OWNER_HASH_BITS);
	int owner = lock_owner_array[hash];
	yield_to_cpu(owner);
}

And call set_owner() after the ticket lock is acquired, and don't bother
clearing it again; a new acquire will overwrite, a collision we have to
live with.

It should on average get you the right yield and does away with having
to track the owner field in place.

It does however get you an extra cacheline miss on acquire :/

One could consider patching it out when you know your kernel is not
running on an overloaded partition.
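
(A hedged sketch of how the hash-table hint might be wired in, reusing the set_owner()/
yield_to_owner() names from above; the smp_processor_id() owner value and the exact call
sites are assumptions, not a tested implementation:)

static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
{
	/* ... ticket acquisition as in the patch ... */
out:
	/* record ourselves as the probable holder; never cleared on unlock */
	set_owner(lock, smp_processor_id());
}

void __spin_yield(arch_spinlock_t *lock)
{
	/* ... existing yield_count checks ..., then confer to the hinted owner */
	yield_to_owner(lock);
}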
Benjamin Herrenschmidt Feb. 11, 2014, 2:44 a.m. UTC | #6
(Linus, Al, a question for you down there about lockref "ref" size)

On Mon, 2014-02-10 at 16:52 +0100, Torsten Duwe wrote:

> What if I squeeze the bits a little?
> 4k vCPUs, and 256 physical, as a limit to stay within 32 bits?
> At the cost that unlock may become an ll/sc operation again.
> I could think about a trick against that.
> But alas, hw_cpu_id is 16 bits, which makes a lookup table necessary :-/
> 
> Doing another round of yields for lockrefs now doesn't
> sound so bad any more.

So, the ticketpair has to be 16:16 so we can avoid the atomic on unlock.

That leaves us with 32 bits to put the ref and the owner. The question
is how big the ref really has to be, and can we have a reasonable failure
mode if it overflows?

If we limit ourselves to, for example, 16-bit for the ref in lockref,
then we can have the second 32-bit split between the owner and the ref.

If we limit ourselves to 4k CPUs, then we get 4 more bits of ref ...

So the question is, is it reasonable to have the ref smaller than
32-bit...
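
(For illustration, the bit budget under discussion, assuming kernel u16/u32/u64 types, a
fixed 16:16 ticketpair and a 4k-CPU bound on the owner; a hypothetical packing, not a
proposal:)

typedef struct {
	union {
		struct {
			u16 tail;	/* ticket tail				*/
			u16 head;	/* ticket head				*/
			u32 owner : 12;	/* vCPU number, 4k CPUs max		*/
			u32 ref   : 20;	/* ~1M references before overflow	*/
		};
		u64 lock_count;		/* the single word lockref would cmpxchg64() */
	};
} ppc_lockref_t;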

Cheers,
Ben.
Al Viro Feb. 11, 2014, 2:56 a.m. UTC | #7
On Tue, Feb 11, 2014 at 01:44:20PM +1100, Benjamin Herrenschmidt wrote:
> That leaves us with 32 bits to put the ref and the owner. The question
> is how big the ref really has to be, and can we have a reasonable failure
> mode if it overflows?
> 
> If we limit ourselves to, for example, 16-bit for the ref in lockref,
> then we can have the second 32-bit split between the owner and the ref.
> 
> If we limit ourselves to 4k CPUs, then we get 4 more bits of ref ...
> 
> So the question is, is it reasonable to have the ref smaller than
> 32-bit...

Every time you open a file, you bump dentry refcount.  Something like
libc or ld.so will be opened on just about every execve(), so I'd say
that 16 bits is far too low.  If nothing else, 32 bits might be too
low on 64bit boxen...
Benjamin Herrenschmidt Feb. 11, 2014, 3:38 a.m. UTC | #8
On Tue, 2014-02-11 at 02:56 +0000, Al Viro wrote:
> > So the question is, is it reasonable to have the ref smaller than
> > 32-bit...
> 
> Every time you open a file, you bump dentry refcount.  Something like
> libc or ld.so will be opened on just about every execve(), so I'd say
> that 16 bits is far too low.  If nothing else, 32 bits might be too
> low on 64bit boxen...

So back to square one ... we can't implement lockref, ticket locks,
and our lock-confer mechanism together within 64 bits.

I see two options at this stage. Both require a custom implementation
of lockref for powerpc, plus some #ifdefs so that we can replace the
generic implementation completely.

 - We can use a small ref, and when it's too big, overflow into a larger
one, falling back to the "old style" lock + ref (an overflow bit or a
compare with 0xffff)

 - We can have lockref "build" its own lock out of the ticketpair and
ref, keeping the owner in a separate word. The owner doesn't strictly
need to be atomic.

Both are gross though :(

Anybody has a better idea ?

Ben.
Raghavendra KT Feb. 11, 2014, 9:39 a.m. UTC | #9
On Fri, Feb 7, 2014 at 10:28 PM, Torsten Duwe <duwe@lst.de> wrote:
> Ticket locks for ppc, version 2. Changes since v1:
> * The atomically exchanged entity is always 32 bits.
> * asm inline string variations thus removed.
> * Carry the additional holder hint only #if defined(CONFIG_PPC_SPLPAR)
>
> Signed-off-by: Torsten Duwe <duwe@suse.de>
> --
[...]
> +static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
>  {
> +       register struct __raw_tickets old, tmp,
> +               inc = { .tail = TICKET_LOCK_INC };
> +
>         CLEAR_IO_SYNC;
> -       while (1) {
> -               if (likely(__arch_spin_trylock(lock) == 0))
> -                       break;
> +       __asm__ __volatile__(
> +"1:    lwarx   %0,0,%4         # arch_spin_lock\n"
> +"      add     %1,%3,%0\n"
> +       PPC405_ERR77(0, "%4")
> +"      stwcx.  %1,0,%4\n"
> +"      bne-    1b"
> +       : "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
> +       : "r" (inc), "r" (&lock->tickets)
> +       : "cc");
> +
> +       if (likely(old.head == old.tail))
> +               goto out;
> +
> +       for (;;) {
> +               unsigned count = 100;

I am sure you wanted to tune the total number of loops to the typical lock-holding time ...
Raghavendra KT Feb. 11, 2014, 9:53 a.m. UTC | #10
On Mon, Feb 10, 2014 at 8:40 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Fri, 2014-02-07 at 17:58 +0100, Torsten Duwe wrote:
>>  typedef struct {
>> -       volatile unsigned int slock;
>> -} arch_spinlock_t;
>> +       union {
>> +               __ticketpair_t head_tail;
>> +               struct __raw_tickets {
>> +#ifdef __BIG_ENDIAN__          /* The "tail" part should be in the MSBs */
>> +                       __ticket_t tail, head;
>> +#else
>> +                       __ticket_t head, tail;
>> +#endif
>> +               } tickets;
>> +       };
>> +#if defined(CONFIG_PPC_SPLPAR)
>> +       u32 holder;
>> +#endif
>> +} arch_spinlock_t __aligned(4);
>
> That's still broken with lockref (which we just merged).
>
> We must have the arch_spinlock_t and the ref in the same 64-bit word
> otherwise it will break.
>
> We can make it work in theory since the holder doesn't have to be
> accessed atomically, but the practicals are a complete mess ...
> lockref would essentially have to re-implement the holder handling
> of the spinlocks and use lower level ticket stuff.
>

Probably a very basic and stupid question from me:
how important is it to have the holder information for PPC? From my
previous experiments on x86, it was lock-waiter preemption that was
problematic, rather than lock-holder preemption.
Torsten Duwe Feb. 11, 2014, 10:40 a.m. UTC | #11
On Tue, Feb 11, 2014 at 03:23:51PM +0530, Raghavendra KT wrote:
> How important is it to have the holder information for PPC? From my
> previous experiments on x86, it was lock-waiter preemption that was
> problematic, rather than lock-holder preemption.

It's something very special to IBM pSeries: the hypervisor can assign
fractions of physical CPUs to guests. Sometimes a guest with 4 quarter
CPUs will be faster than 1 monoprocessor. (correct me if I'm wrong).

The directed yield resolves the silly situation when holder and waiter
reside on the same physical CPU, as I understand it.

x86 has nothing comparable.

	Torsten
Scott Wood Feb. 11, 2014, 6:30 p.m. UTC | #12
On Tue, 2014-02-11 at 11:40 +0100, Torsten Duwe wrote:
> On Tue, Feb 11, 2014 at 03:23:51PM +0530, Raghavendra KT wrote:
> > How important is it to have the holder information for PPC? From my
> > previous experiments on x86, it was lock-waiter preemption that was
> > problematic, rather than lock-holder preemption.
> 
> It's something very special to IBM pSeries: the hypervisor can assign
> fractions of physical CPUs to guests. Sometimes a guest with 4 quarter
> CPUs will be faster than 1 monoprocessor. (correct me if I'm wrong).
> 
> The directed yield resolves the silly situation when holder and waiter
> reside on the same physical CPU, as I understand it.
> 
> x86 has nothing comparable.

How is this different from the very ordinary case of an SMP KVM guest
whose vcpus are not bound to host cpus, and thus you could have multiple
vcpus running on the same host cpu?

-Scott
Benjamin Herrenschmidt Feb. 11, 2014, 7:34 p.m. UTC | #13
On Tue, 2014-02-11 at 12:30 -0600, Scott Wood wrote:
> > It's something very special to IBM pSeries: the hypervisor can assign
> > fractions of physical CPUs to guests. Sometimes a guest with 4 quarter
> > CPUs will be faster than 1 monoprocessor. (correct me if I'm wrong).
> > 
> > The directed yield resolves the silly situation when holder and waiter
> > reside on the same physical CPU, as I understand it.
> > 
> > x86 has nothing comparable.
> 
> How is this different from the very ordinary case of an SMP KVM guest
> whose vcpus are not bound to host cpus, and thus you could have multiple
> vcpus running on the same host cpu?

It's not really ... though I can see drawbacks with the scheme as well,
and I think in KVM we should be careful to only confer if the owner
vcpu last scheduled on the same physical cpu as the waiter; otherwise
there's too much chance of us bouncing things around the machine
for minor contention cases.

Paul, what's your policy today?

Cheers,
Ben.

Patch

diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h
index 2351adc..fce1383 100644
--- a/arch/powerpc/include/asm/spinlock_types.h
+++ b/arch/powerpc/include/asm/spinlock_types.h
@@ -5,11 +5,34 @@ 
 # error "please don't include this file directly"
 #endif
 
+typedef u16 __ticket_t;
+typedef u32 __ticketpair_t;
+
+#define TICKET_LOCK_INC	((__ticket_t)1)
+
+#define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
+
 typedef struct {
-	volatile unsigned int slock;
-} arch_spinlock_t;
+	union {
+		__ticketpair_t head_tail;
+		struct __raw_tickets {
+#ifdef __BIG_ENDIAN__		/* The "tail" part should be in the MSBs */
+			__ticket_t tail, head;
+#else
+			__ticket_t head, tail;
+#endif
+		} tickets;
+	};
+#if defined(CONFIG_PPC_SPLPAR)
+	u32 holder;
+#endif
+} arch_spinlock_t __aligned(4);
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#if defined(CONFIG_PPC_SPLPAR)
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 }, 0 }
+#else
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
+#endif
 
 typedef struct {
 	volatile signed int lock;
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 5f54a74..a931514 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -9,8 +9,7 @@ 
  * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
  * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM
  *	Rework to support virtual processors
- *
- * Type of int is used as a full 64b word is not necessary.
+ * Copyright (C) 2014 Torsten Duwe <duwe@suse.de>, ticket lock port
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -28,7 +27,20 @@ 
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-#define arch_spin_is_locked(x)		((x)->slock != 0)
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+	return tmp.tail != tmp.head;
+}
+
+static inline int arch_spin_is_contended(arch_spinlock_t *lock)
+{
+	struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+
+	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+}
+#define arch_spin_is_contended	arch_spin_is_contended
 
 #ifdef CONFIG_PPC64
 /* use 0x800000yy when locked, where yy == CPU number */
@@ -55,33 +67,59 @@ 
 #endif
 
 /*
- * This returns the old value in the lock, so we succeeded
- * in getting the lock if the return value is 0.
+ * Our own cmpxchg, operating on spinlock_t's.  Returns 0 iff value
+ * read at lock was equal to "old" AND the cmpxchg succeeded
+ * uninterruptedly.
  */
-static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock)
+static __always_inline int __arch_spin_cmpxchg_eq(arch_spinlock_t *lock,
+						  __ticketpair_t old,
+						  __ticketpair_t new)
 {
-	unsigned long tmp, token;
+	register int retval = 1;
+	register __ticketpair_t tmp;
 
-	token = LOCK_TOKEN;
-	__asm__ __volatile__(
-"1:	" PPC_LWARX(%0,0,%2,1) "\n\
-	cmpwi		0,%0,0\n\
-	bne-		2f\n\
-	stwcx.		%1,0,%2\n\
-	bne-		1b\n"
-	PPC_ACQUIRE_BARRIER
+	__asm__ __volatile__ (
+"	li %0,1\n"		/* default to "fail" */
+	PPC_RELEASE_BARRIER
+"1:	lwarx	%2,0,%5		# __arch_spin_cmpxchg_eq\n"
+"	cmp	0,0,%3,%2\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0, "%5")
+"	stwcx.	%4,0,%5\n"
+"	bne-	1b\n"
+"	isync\n"
+"	li %0,0\n"
 "2:"
-	: "=&r" (tmp)
-	: "r" (token), "r" (&lock->slock)
-	: "cr0", "memory");
+	: "=&r" (retval), "+m" (*lock)
+	: "r" (tmp), "r" (old), "r" (new), "r" (lock)
+	: "cc", "memory");
 
-	return tmp;
+	return retval;
+}
+
+static __always_inline int __arch_spin_trylock(arch_spinlock_t *lock)
+{
+	arch_spinlock_t old, new;
+
+	old.tickets = ACCESS_ONCE(lock->tickets);
+	if (old.tickets.head != old.tickets.tail)
+		return 0;
+
+	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+
+	if (__arch_spin_cmpxchg_eq(lock, old.head_tail, new.head_tail))
+		return 0;
+
+#if defined(CONFIG_PPC_SPLPAR)
+	lock->holder = LOCK_TOKEN;
+#endif
+	return 1;
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
 	CLEAR_IO_SYNC;
-	return __arch_spin_trylock(lock) == 0;
+	return  __arch_spin_trylock(lock);
 }
 
 /*
@@ -93,9 +131,8 @@  static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * rest of our timeslice to the lock holder.
  *
  * So that we can tell which virtual processor is holding a lock,
- * we put 0x80000000 | smp_processor_id() in the lock when it is
- * held.  Conveniently, we have a word in the paca that holds this
- * value.
+ * we put 0x80000000 | smp_processor_id() into lock->holder.
+ * Conveniently, we have a word in the paca that holds this value.
  */
 
 #if defined(CONFIG_PPC_SPLPAR)
@@ -109,19 +146,55 @@  extern void __rw_yield(arch_rwlock_t *lock);
 #define SHARED_PROCESSOR	0
 #endif
 
-static inline void arch_spin_lock(arch_spinlock_t *lock)
+/*
+ * Ticket locks are conceptually two parts, one indicating the current head of
+ * the queue, and the other indicating the current tail. The lock is acquired
+ * by atomically noting the tail and incrementing it by one (thus adding
+ * ourself to the queue and noting our position), then waiting until the head
+ * becomes equal to the initial value of the tail.
+ *
+ * We use an asm covering *both* parts of the lock, to increment the tail and
+ * also load the position of the head, which takes care of memory ordering
+ * issues and should be optimal for the uncontended case. Note the tail must be
+ * in the high part, because a wide add increment of the low part would carry
+ * up and contaminate the high part.
+ */
+static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 {
+	register struct __raw_tickets old, tmp,
+		inc = { .tail = TICKET_LOCK_INC };
+
 	CLEAR_IO_SYNC;
-	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
-			break;
+	__asm__ __volatile__(
+"1:	lwarx	%0,0,%4		# arch_spin_lock\n"
+"	add	%1,%3,%0\n"
+	PPC405_ERR77(0, "%4")
+"	stwcx.	%1,0,%4\n"
+"	bne-	1b"
+	: "=&r" (old), "=&r" (tmp), "+m" (lock->tickets)
+	: "r" (inc), "r" (&lock->tickets)
+	: "cc");
+
+	if (likely(old.head == old.tail))
+		goto out;
+
+	for (;;) {
+		unsigned count = 100;
+
 		do {
+			if (ACCESS_ONCE(lock->tickets.head) == old.tail)
+				goto out;
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
+		} while (--count);
 		HMT_medium();
 	}
+out:
+#if defined(CONFIG_PPC_SPLPAR)
+	lock->holder = LOCK_TOKEN;
+#endif
+	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static inline
@@ -131,7 +204,7 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 
 	CLEAR_IO_SYNC;
 	while (1) {
-		if (likely(__arch_spin_trylock(lock) == 0))
+		if (likely(__arch_spin_trylock(lock)))
 			break;
 		local_save_flags(flags_dis);
 		local_irq_restore(flags);
@@ -139,7 +212,7 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 			HMT_low();
 			if (SHARED_PROCESSOR)
 				__spin_yield(lock);
-		} while (unlikely(lock->slock != 0));
+		} while (arch_spin_is_locked(lock));
 		HMT_medium();
 		local_irq_restore(flags_dis);
 	}
@@ -147,10 +220,21 @@  void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
+	arch_spinlock_t old, new;
+
+#if defined(CONFIG_PPC_SPLPAR)
+	lock->holder = 0;
+#endif
+	do {
+		old.tickets = ACCESS_ONCE(lock->tickets);
+		new.tickets.head = old.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+	} while (unlikely(__arch_spin_cmpxchg_eq(lock,
+						 old.head_tail,
+						 new.head_tail)));
 	SYNC_IO;
 	__asm__ __volatile__("# arch_spin_unlock\n\t"
 				PPC_RELEASE_BARRIER: : :"memory");
-	lock->slock = 0;
 }
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c
index 0c9c8d7..4a57e32 100644
--- a/arch/powerpc/lib/locks.c
+++ b/arch/powerpc/lib/locks.c
@@ -27,7 +27,7 @@  void __spin_yield(arch_spinlock_t *lock)
 {
 	unsigned int lock_value, holder_cpu, yield_count;
 
-	lock_value = lock->slock;
+	lock_value = lock->holder;
 	if (lock_value == 0)
 		return;
 	holder_cpu = lock_value & 0xffff;
@@ -36,7 +36,7 @@  void __spin_yield(arch_spinlock_t *lock)
 	if ((yield_count & 1) == 0)
 		return;		/* virtual cpu is currently running */
 	rmb();
-	if (lock->slock != lock_value)
+	if (lock->holder != lock_value)
 		return;		/* something has changed */
 	plpar_hcall_norets(H_CONFER,
 		get_hard_smp_processor_id(holder_cpu), yield_count);
@@ -70,7 +70,7 @@  void __rw_yield(arch_rwlock_t *rw)
 
 void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	while (lock->slock) {
+	while (arch_spin_is_locked(lock)) {
 		HMT_low();
 		if (SHARED_PROCESSOR)
 			__spin_yield(lock);