
[-v4,5/7] locking, arch: Update spin_unlock_wait()

Message ID 20160602115439.085385545@infradead.org
State Accepted
Delegated to: Pablo Neira

Commit Message

Peter Zijlstra June 2, 2016, 11:52 a.m. UTC
This patch updates/fixes all spin_unlock_wait() implementations.

The update is in the semantics: where previously there was only a control
dependency, we now upgrade to a full load-acquire to match the
store-release from the spin_unlock() we waited on. This ensures that
when spin_unlock_wait() returns, we're guaranteed to observe the full
critical section we waited on.

This fixes a number of spin_unlock_wait() users that (not
unreasonably) rely on this.
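
For illustration, the guarantee reads roughly as follows (a hand-wavy
sketch; writer()/waiter() are made up, not taken from any real user):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(lock);
	static int data;

	static void writer(void)		/* CPU0 */
	{
		spin_lock(&lock);
		data = 1;			/* store in the critical section */
		spin_unlock(&lock);		/* store-release */
	}

	static int waiter(void)			/* CPU1 */
	{
		spin_unlock_wait(&lock);	/* now a load-acquire */

		/*
		 * If the wait observed writer()'s critical section (that is,
		 * saw the lock held and waited for its spin_unlock()), the
		 * ACQUIRE here pairs with that RELEASE and the load below is
		 * guaranteed to return 1.
		 */
		return READ_ONCE(data);
	}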

I also fixed a number of ticket-lock versions to only wait on the
current lock holder, instead of waiting for a full unlock, as this is
sufficient.

Furthermore, again for the ticket locks, I added an smp_rmb() between
the initial ticket load and the spin loop testing the current value,
because I could not convince myself the address dependency is
sufficient, especially if the loads are of different sizes.

I'm more than happy to remove this smp_rmb() again if people are
certain the address dependency does indeed work as expected.
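
For the ticket locks, the result then looks roughly like this (sketch
only; the exact field names and the relax primitive, cpu_relax() vs.
wfe(), differ per architecture):

	static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
	{
		/* Snapshot the owner whose critical section we wait on. */
		u16 owner = READ_ONCE(lock->tickets.owner);

		/* Order the owner snapshot before the loads in the loop. */
		smp_rmb();

		for (;;) {
			arch_spinlock_t tmp = READ_ONCE(*lock);

			/* Unlocked, or the snapshotted owner moved on. */
			if (tmp.tickets.owner == tmp.tickets.next ||
			    tmp.tickets.owner != owner)
				break;

			cpu_relax();
		}
		/* Upgrade the control dependency to ACQUIRE. */
		smp_acquire__after_ctrl_dep();
	}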

Cc: jejb@parisc-linux.org
Cc: davem@davemloft.net
Cc: chris@zankel.net
Cc: rth@twiddle.net
Cc: dhowells@redhat.com
Cc: schwidefsky@de.ibm.com
Cc: mpe@ellerman.id.au
Cc: ralf@linux-mips.org
Cc: linux@armlinux.org.uk
Cc: rkuo@codeaurora.org
Cc: vgupta@synopsys.com
Cc: james.hogan@imgtec.com
Cc: realmz6@gmail.com
Cc: ysato@users.sourceforge.jp
Cc: tony.luck@intel.com
Cc: cmetcalf@mellanox.com
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/alpha/include/asm/spinlock.h    |    9 +++++++--
 arch/arc/include/asm/spinlock.h      |    7 +++++--
 arch/arm/include/asm/spinlock.h      |   19 +++++++++++++++++--
 arch/blackfin/include/asm/spinlock.h |    5 +++--
 arch/hexagon/include/asm/spinlock.h  |   10 ++++++++--
 arch/ia64/include/asm/spinlock.h     |    4 ++++
 arch/m32r/include/asm/spinlock.h     |    9 +++++++--
 arch/metag/include/asm/spinlock.h    |   14 ++++++++++++--
 arch/mips/include/asm/spinlock.h     |   19 +++++++++++++++++--
 arch/mn10300/include/asm/spinlock.h  |    8 +++++++-
 arch/parisc/include/asm/spinlock.h   |    9 +++++++--
 arch/powerpc/include/asm/spinlock.h  |    8 ++++++--
 arch/s390/include/asm/spinlock.h     |    3 +++
 arch/sh/include/asm/spinlock.h       |   10 ++++++++--
 arch/sparc/include/asm/spinlock_32.h |    7 +++++--
 arch/sparc/include/asm/spinlock_64.h |   10 +++++++---
 arch/tile/lib/spinlock_32.c          |    6 ++++++
 arch/tile/lib/spinlock_64.c          |    6 ++++++
 arch/xtensa/include/asm/spinlock.h   |   10 ++++++++--
 include/asm-generic/barrier.h        |    2 +-
 include/asm-generic/qspinlock.h      |    5 +++--
 include/linux/spinlock_up.h          |   10 +++++++---
 22 files changed, 154 insertions(+), 36 deletions(-)




Comments

Boqun Feng June 2, 2016, 2:24 p.m. UTC | #1
On Thu, Jun 02, 2016 at 01:52:02PM +0200, Peter Zijlstra wrote:
[snip]
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -27,6 +27,8 @@
>  #include <asm/asm-compat.h>
>  #include <asm/synch.h>
>  #include <asm/ppc-opcode.h>
> +#include <asm/barrier.h>
> +#include <asm/processor.h>
>  
>  #ifdef CONFIG_PPC64
>  /* use 0x800000yy when locked, where yy == CPU number */
> @@ -165,8 +167,10 @@ static inline void arch_spin_unlock(arch
>  #ifdef CONFIG_PPC64
>  extern void arch_spin_unlock_wait(arch_spinlock_t *lock);
>  #else
> -#define arch_spin_unlock_wait(lock) \
> -	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
> +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
> +{
> +	smp_cond_load_acquire(&lock->slock, !VAL);
> +}
>  #endif
>  

About spin_unlock_wait() on ppc, I actually have a fix pending review:

http://lkml.kernel.org/r/1461130033-70898-1-git-send-email-boqun.feng@gmail.com

that patch fixed a different problem when people want to pair a
spin_unlock_wait() with a spin_lock().
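
The kind of pairing I mean is roughly this (hand-wavy sketch; the names
are made up rather than taken from a real caller):

	static DEFINE_SPINLOCK(lock);
	static int x, y;

	static void locker(void)		/* CPU0 */
	{
		int r0;

		spin_lock(&lock);
		WRITE_ONCE(x, 1);
		r0 = READ_ONCE(y);
		spin_unlock(&lock);
	}

	static void unlock_waiter(void)		/* CPU1 */
	{
		int r1;

		WRITE_ONCE(y, 1);
		smp_mb();
		spin_unlock_wait(&lock);
		r1 = READ_ONCE(x);
	}

	/*
	 * Roughly: the outcome r0 == 0 && r1 == 0 should be forbidden,
	 * i.e. spin_unlock_wait() must not be able to miss a concurrent
	 * lock holder whose critical section it then fails to observe.
	 */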

I think we still need that fix, and there are two conflicts with this
series:

1.	arch_spin_unlock_wait() code for PPC32 was deleted, and
	consolidated into one.

2.	I actually downgraded spin_unlock_wait() to !ACQUIRE ;-)


I can think of two ways to solve those conflicts:

1.	Modify my patch to make spin_unlock_wait() an ACQUIRE, and it
	can be merged in the powerpc tree, and possibly go into mainline
	before 4.7. Then there is no need for this series to have code
	for ppc, therefore no conflict.

or

2.	I can rebase my patch on this series, and it can be added in
	this series, and will go into mainline at 4.8.


Michael and Peter, any thoughts?

Regards,
Boqun

>  /*
Peter Zijlstra June 2, 2016, 2:44 p.m. UTC | #2
On Thu, Jun 02, 2016 at 10:24:40PM +0800, Boqun Feng wrote:
> On Thu, Jun 02, 2016 at 01:52:02PM +0200, Peter Zijlstra wrote:
> About spin_unlock_wait() on ppc, I actually have a fix pending review:
> 
> http://lkml.kernel.org/r/1461130033-70898-1-git-send-email-boqun.feng@gmail.com

Please use the normal commit quoting style:

  d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers")

> that patch fixed a different problem when people want to pair a
> spin_unlock_wait() with a spin_lock().

Argh, indeed, and I think qspinlock is still broken there :/ But my poor
brain is about to give in for the day.

Let me go ponder that some :/

> I think we still need that fix, and there are two conflicts with this
> series:
> 
> 1.	arch_spin_unlock_wait() code for PPC32 was deleted, and
> 	consolidated into one.

Nice.

> 2.	I actually downgraded spin_unlock_wait() to !ACQUIRE ;-)

Fail ;-)

> I can think of two ways to solve those conflicts:
> 
> 1.	Modify my patch to make spin_unlock_wait() an ACQUIRE, and it
> 	can be merged in the powerpc tree, and possibly go into mainline
> 	before 4.7. Then there is no need for this series to have code
> 	for ppc, therefore no conflict.

Hardly any other unlock_wait is an acquire, everyone is 'broken' :-/

> or
> 
> 2.	I can rebase my patch on this series, and it can be added in
> 	this series, and will go into mainline at 4.8.
> 
> 
> Michael and Peter, any thoughts?

I'm fine with it going in early, I can rebase, no problem.
Boqun Feng June 2, 2016, 3:11 p.m. UTC | #3
On Thu, Jun 02, 2016 at 04:44:24PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 02, 2016 at 10:24:40PM +0800, Boqun Feng wrote:
> > On Thu, Jun 02, 2016 at 01:52:02PM +0200, Peter Zijlstra wrote:
> > About spin_unlock_wait() on ppc, I actually have a fix pending review:
> > 
> > http://lkml.kernel.org/r/1461130033-70898-1-git-send-email-boqun.feng@gmail.com
> 
> Please use the normal commit quoting style:
> 
>   d86b8da04dfa ("arm64: spinlock: serialise spin_unlock_wait against concurrent lockers")
> 

Good point ;-)

> > that patch fixed a different problem when people want to pair a
> > spin_unlock_wait() with a spin_lock().
> 
> Argh, indeed, and I think qspinlock is still broken there :/ But my poor
> brain is about to give in for the day.
> 
> Let me go ponder that some :/
> 

An initial thought for the fix is making queued_spin_unlock_wait() an
atomic-nop too:

static inline void queued_spin_unlock_wait(struct qspinlock *lock)
{
	struct __qspinlock *l = (struct __qspinlock *)lock;
	
	while (!cmpxchg(&l->locked, 0, 0))
		cpu_relax();
}

This could make queued_spin_unlock_wait() a WRITE; with a smp_mb()
preceding it, it would act like a RELEASE, which can be paired with
spin_lock().

Just food for thought. ;-)

> > I think we still need that fix, and there are two conflicts with this
> > series:
> > 
> > 1.	arch_spin_unlock_wait() code for PPC32 was deleted, and
> > 	consolidated into one.
> 
> Nice.
> 
> > 2.	I actually downgraded spin_unlock_wait() to !ACQUIRE ;-)
> 
> Fail ;-)
> 
> > I can think of two ways to solve those conflicts:
> > 
> > 1.	Modify my patch to make spin_unlock_wait() an ACQUIRE, and it
> > 	can be merged in the powerpc tree, and possibly go into mainline
> > 	before 4.7. Then there is no need for this series to have code
> > 	for ppc, therefore no conflict.
> 
> Hardly any other unlock_wait is an acquire, everyone is 'broken' :-/
> 
> > or
> > 
> > 2.	I can rebase my patch on this series, and it can be added in
> > 	this series, and will go into mainline at 4.8.
> > 
> > 
> > Michael and Peter, any thoughts?
> 
> I'm fine with it going in early, I can rebase, no problem.

OK, I will resend a new patch making spin_unlock_wait() align with the
semantics in your series.

Regards,
Boqun
Boqun Feng June 2, 2016, 3:57 p.m. UTC | #4
On Thu, Jun 02, 2016 at 11:11:07PM +0800, Boqun Feng wrote:
[snip]
> 
> OK, I will resend a new patch making spin_unlock_wait() align with the
> semantics in your series.
> 

I realize that if my patch goes first, it's safer and more convenient
to keep the two smp_mb()s in the ppc arch_spin_unlock_wait(). I will
only do the fix and cleanup in my patch and leave the semantics-changing
part to you ;-)

> Regards,
> Boqun
Peter Zijlstra June 2, 2016, 4:04 p.m. UTC | #5
On Thu, Jun 02, 2016 at 11:11:07PM +0800, Boqun Feng wrote:
> On Thu, Jun 02, 2016 at 04:44:24PM +0200, Peter Zijlstra wrote:
> > Let me go ponder that some :/
> > 
> 
> An initial thought for the fix is making queued_spin_unlock_wait() an
> atomic-nop too:
> 
> static inline void queued_spin_unlock_wait(struct qspinlock *lock)
> {
> 	struct __qspinlock *l = (struct __qspinlock *)lock;
> 	
> 	while (!cmpxchg(&l->locked, 0, 0))
> 		cpu_relax();
> }
> 
> This could make queued_spin_unlock_wait() a WRITE; with a smp_mb()
> preceding it, it would act like a RELEASE, which can be paired with
> spin_lock().
> 
> Just food for thought. ;-)

Not sure that'll actually work. The qspinlock store is completely
unordered and not part of an ll/sc or anything like that.

Doing competing stores might even result in losing it entirely.

But I think I got something.. Lemme go test it :-)



Patch

--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -3,6 +3,8 @@ 
 
 #include <linux/kernel.h>
 #include <asm/current.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -13,8 +15,11 @@ 
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 #define arch_spin_is_locked(x)	((x)->lock != 0)
-#define arch_spin_unlock_wait(x) \
-		do { cpu_relax(); } while ((x)->lock)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -15,8 +15,11 @@ 
 
 #define arch_spin_is_locked(x)	((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__)
 #define arch_spin_lock_flags(lock, flags)	arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
-	do { while (arch_spin_is_locked(x)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->slock, !VAL);
+}
 
 #ifdef CONFIG_ARC_HAS_LLSC
 
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -6,6 +6,8 @@ 
 #endif
 
 #include <linux/prefetch.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 /*
  * sev and wfe are ARMv6K extensions.  Uniprocessor ARMv6 may not have the K
@@ -50,8 +52,21 @@  static inline void dsb_sev(void)
  * memory.
  */
 
-#define arch_spin_unlock_wait(lock) \
-	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	u16 owner = READ_ONCE(lock->tickets.owner);
+
+	for (;;) {
+		arch_spinlock_t tmp = READ_ONCE(*lock);
+
+		if (tmp.tickets.owner == tmp.tickets.next ||
+		    tmp.tickets.owner != owner)
+			break;
+
+		wfe();
+	}
+	smp_acquire__after_ctrl_dep();
+}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
--- a/arch/blackfin/include/asm/spinlock.h
+++ b/arch/blackfin/include/asm/spinlock.h
@@ -12,6 +12,8 @@ 
 #else
 
 #include <linux/atomic.h>
+#include <asm/processor.h>
+#include <asm/barrier.h>
 
 asmlinkage int __raw_spin_is_locked_asm(volatile int *ptr);
 asmlinkage void __raw_spin_lock_asm(volatile int *ptr);
@@ -48,8 +50,7 @@  static inline void arch_spin_unlock(arch
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	while (arch_spin_is_locked(lock))
-		cpu_relax();
+	smp_cond_load_acquire(&lock->lock, !VAL);
 }
 
 static inline int arch_read_can_lock(arch_rwlock_t *rw)
--- a/arch/hexagon/include/asm/spinlock.h
+++ b/arch/hexagon/include/asm/spinlock.h
@@ -23,6 +23,8 @@ 
 #define _ASM_SPINLOCK_H
 
 #include <asm/irqflags.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 /*
  * This file is pulled in for SMP builds.
@@ -176,8 +178,12 @@  static inline unsigned int arch_spin_try
  * SMP spinlocks are intended to allow only a single CPU at the lock
  */
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(lock) \
-	do {while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->lock, !VAL);
+}
+
 #define arch_spin_is_locked(x) ((x)->lock != 0)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -15,6 +15,8 @@ 
 
 #include <linux/atomic.h>
 #include <asm/intrinsics.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 #define arch_spin_lock_init(x)			((x)->lock = 0)
 
@@ -86,6 +88,8 @@  static __always_inline void __ticket_spi
 			return;
 		cpu_relax();
 	}
+
+	smp_acquire__after_ctrl_dep();
 }
 
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
--- a/arch/m32r/include/asm/spinlock.h
+++ b/arch/m32r/include/asm/spinlock.h
@@ -13,6 +13,8 @@ 
 #include <linux/atomic.h>
 #include <asm/dcache_clear.h>
 #include <asm/page.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
@@ -27,8 +29,11 @@ 
 
 #define arch_spin_is_locked(x)		(*(volatile int *)(&(x)->slock) <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
-		do { cpu_relax(); } while (arch_spin_is_locked(x))
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->slock, VAL > 0);
+}
 
 /**
  * arch_spin_trylock - Try spin lock and return a result
--- a/arch/metag/include/asm/spinlock.h
+++ b/arch/metag/include/asm/spinlock.h
@@ -1,14 +1,24 @@ 
 #ifndef __ASM_SPINLOCK_H
 #define __ASM_SPINLOCK_H
 
+#include <asm/barrier.h>
+#include <asm/processor.h>
+
 #ifdef CONFIG_METAG_ATOMICITY_LOCK1
 #include <asm/spinlock_lock1.h>
 #else
 #include <asm/spinlock_lnkget.h>
 #endif
 
-#define arch_spin_unlock_wait(lock) \
-	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+/*
+ * both lock1 and lnkget are test-and-set spinlocks with 0 unlocked and 1
+ * locked.
+ */
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -12,6 +12,7 @@ 
 #include <linux/compiler.h>
 
 #include <asm/barrier.h>
+#include <asm/processor.h>
 #include <asm/compiler.h>
 #include <asm/war.h>
 
@@ -48,8 +49,22 @@  static inline int arch_spin_value_unlock
 }
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
-	while (arch_spin_is_locked(x)) { cpu_relax(); }
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	u16 owner = READ_ONCE(lock->h.serving_now);
+	smp_rmb();
+	for (;;) {
+		arch_spinlock_t tmp = READ_ONCE(*lock);
+
+		if (tmp.h.serving_now == tmp.h.ticket ||
+		    tmp.h.serving_now != owner)
+			break;
+
+		cpu_relax();
+	}
+	smp_acquire__after_ctrl_dep();
+}
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
--- a/arch/mn10300/include/asm/spinlock.h
+++ b/arch/mn10300/include/asm/spinlock.h
@@ -12,6 +12,8 @@ 
 #define _ASM_SPINLOCK_H
 
 #include <linux/atomic.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 #include <asm/rwlock.h>
 #include <asm/page.h>
 
@@ -23,7 +25,11 @@ 
  */
 
 #define arch_spin_is_locked(x)	(*(volatile signed char *)(&(x)->slock) != 0)
-#define arch_spin_unlock_wait(x) do { barrier(); } while (arch_spin_is_locked(x))
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->slock, !VAL);
+}
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -13,8 +13,13 @@  static inline int arch_spin_is_locked(ar
 }
 
 #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
-#define arch_spin_unlock_wait(x) \
-		do { cpu_relax(); } while (arch_spin_is_locked(x))
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *x)
+{
+	volatile unsigned int *a = __ldcw_align(x);
+
+	smp_cond_load_acquire(a, VAL);
+}
 
 static inline void arch_spin_lock_flags(arch_spinlock_t *x,
 					 unsigned long flags)
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -27,6 +27,8 @@ 
 #include <asm/asm-compat.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 #ifdef CONFIG_PPC64
 /* use 0x800000yy when locked, where yy == CPU number */
@@ -165,8 +167,10 @@  static inline void arch_spin_unlock(arch
 #ifdef CONFIG_PPC64
 extern void arch_spin_unlock_wait(arch_spinlock_t *lock);
 #else
-#define arch_spin_unlock_wait(lock) \
-	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->slock, !VAL);
+}
 #endif
 
 /*
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -10,6 +10,8 @@ 
 #define __ASM_SPINLOCK_H
 
 #include <linux/smp.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 #define SPINLOCK_LOCKVAL (S390_lowcore.spinlock_lockval)
 
@@ -97,6 +99,7 @@  static inline void arch_spin_unlock_wait
 {
 	while (arch_spin_is_locked(lock))
 		arch_spin_relax(lock);
+	smp_acquire__after_ctrl_dep();
 }
 
 /*
--- a/arch/sh/include/asm/spinlock.h
+++ b/arch/sh/include/asm/spinlock.h
@@ -19,14 +19,20 @@ 
 #error "Need movli.l/movco.l for spinlocks"
 #endif
 
+#include <asm/barrier.h>
+#include <asm/processor.h>
+
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  */
 
 #define arch_spin_is_locked(x)		((x)->lock <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
-	do { while (arch_spin_is_locked(x)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->lock, VAL > 0);
+}
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -9,12 +9,15 @@ 
 #ifndef __ASSEMBLY__
 
 #include <asm/psr.h>
+#include <asm/barrier.h>
 #include <asm/processor.h> /* for cpu_relax */
 
 #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
 
-#define arch_spin_unlock_wait(lock) \
-	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -8,6 +8,9 @@ 
 
 #ifndef __ASSEMBLY__
 
+#include <asm/processor.h>
+#include <asm/barrier.h>
+
 /* To get debugging spinlocks which detect and catch
  * deadlock situations, set CONFIG_DEBUG_SPINLOCK
  * and rebuild your kernel.
@@ -23,9 +26,10 @@ 
 
 #define arch_spin_is_locked(lp)	((lp)->lock != 0)
 
-#define arch_spin_unlock_wait(lp)	\
-	do {	rmb();			\
-	} while((lp)->lock)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -76,6 +76,12 @@  void arch_spin_unlock_wait(arch_spinlock
 	do {
 		delay_backoff(iterations++);
 	} while (READ_ONCE(lock->current_ticket) == curr);
+
+	/*
+	 * The TILE architecture doesn't do read speculation; therefore
+	 * a control dependency guarantees a LOAD->{LOAD,STORE} order.
+	 */
+	barrier();
 }
 EXPORT_SYMBOL(arch_spin_unlock_wait);
 
--- a/arch/tile/lib/spinlock_64.c
+++ b/arch/tile/lib/spinlock_64.c
@@ -76,6 +76,12 @@  void arch_spin_unlock_wait(arch_spinlock
 	do {
 		delay_backoff(iterations++);
 	} while (arch_spin_current(READ_ONCE(lock->lock)) == curr);
+
+	/*
+	 * The TILE architecture doesn't do read speculation; therefore
+	 * a control dependency guarantees a LOAD->{LOAD,STORE} order.
+	 */
+	barrier();
 }
 EXPORT_SYMBOL(arch_spin_unlock_wait);
 
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -11,6 +11,9 @@ 
 #ifndef _XTENSA_SPINLOCK_H
 #define _XTENSA_SPINLOCK_H
 
+#include <asm/barrier.h>
+#include <asm/processor.h>
+
 /*
  * spinlock
  *
@@ -29,8 +32,11 @@ 
  */
 
 #define arch_spin_is_locked(x) ((x)->slock != 0)
-#define arch_spin_unlock_wait(lock) \
-	do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->slock, !VAL);
+}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -194,7 +194,7 @@  do {									\
 })
 #endif
 
-#endif
+#endif	/* CONFIG_SMP */
 
 /* Barriers for virtual machine guests when talking to an SMP host */
 #define virt_mb() __smp_mb()
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -20,6 +20,8 @@ 
 #define __ASM_GENERIC_QSPINLOCK_H
 
 #include <asm-generic/qspinlock_types.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
 
 /**
  * queued_spin_is_locked - is the spinlock locked?
@@ -133,8 +135,7 @@  static inline void queued_spin_unlock_wa
 {
 	/* See queued_spin_is_locked() */
 	smp_mb();
-	while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
-		cpu_relax();
+	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
 }
 
 #ifndef virt_spin_lock
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -6,6 +6,7 @@ 
 #endif
 
 #include <asm/processor.h>	/* for cpu_relax() */
+#include <asm/barrier.h>
 
 /*
  * include/linux/spinlock_up.h - UP-debug version of spinlocks.
@@ -25,6 +26,11 @@ 
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define arch_spin_is_locked(x)		((x)->slock == 0)
 
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+	smp_cond_load_acquire(&lock->slock, VAL);
+}
+
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
 	lock->slock = 0;
@@ -67,6 +73,7 @@  static inline void arch_spin_unlock(arch
 
 #else /* DEBUG_SPINLOCK */
 #define arch_spin_is_locked(lock)	((void)(lock), 0)
+#define arch_spin_unlock_wait(lock)	do { barrier(); (void)(lock); } while (0)
 /* for sched/core.c and kernel_lock.c: */
 # define arch_spin_lock(lock)		do { barrier(); (void)(lock); } while (0)
 # define arch_spin_lock_flags(lock, flags)	do { barrier(); (void)(lock); } while (0)
@@ -79,7 +86,4 @@  static inline void arch_spin_unlock(arch
 #define arch_read_can_lock(lock)	(((void)(lock), 1))
 #define arch_write_can_lock(lock)	(((void)(lock), 1))
 
-#define arch_spin_unlock_wait(lock) \
-		do { cpu_relax(); } while (arch_spin_is_locked(lock))
-
 #endif /* __LINUX_SPINLOCK_UP_H */