
[-v3,7/8] locking: Move smp_cond_load_acquire() and friends into asm-generic/barrier.h

Message ID: 20160601093158.GN3190@twins.programming.kicks-ass.net
State: Awaiting Upstream
Delegated to: Pablo Neira

Commit Message

Peter Zijlstra June 1, 2016, 9:31 a.m. UTC
On Tue, May 31, 2016 at 04:01:06PM -0400, Waiman Long wrote:
> You are doing two READ_ONCE's in the smp_cond_load_acquire loop. Can we
> change it to do just one READ_ONCE, like
> 
> --- a/include/asm-generic/barrier.h
> +++ b/include/asm-generic/barrier.h
> @@ -229,12 +229,18 @@ do {
>   * value; some architectures can do this in hardware.
>   */
>  #ifndef cmpwait
> -#define cmpwait(ptr, val) do {                                 \
> +#define cmpwait(ptr, val) ({                                   \
>         typeof (ptr) __ptr = (ptr);                             \
> -       typeof (val) __val = (val);                             \
> -       while (READ_ONCE(*__ptr) == __val)                      \
> +       typeof (val) __old = (val);                             \
> +       typeof (val) __new;                                     \
> +       for (;;) {                                              \
> +               __new = READ_ONCE(*__ptr);                      \
> +               if (__new != __old)                             \
> +                       break;                                  \
>                 cpu_relax();                                    \
> -} while (0)
> +       }                                                       \
> +       __new;                                                  \
> +})
>  #endif
> 
>  /**
> @@ -251,12 +257,11 @@ do {
>  #ifndef smp_cond_load_acquire
>  #define smp_cond_load_acquire(ptr, cond_expr) ({               \
>         typeof(ptr) __PTR = (ptr);                              \
> -       typeof(*ptr) VAL;                                       \
> +       typeof(*ptr) VAL = READ_ONCE(*__PTR);                   \
>         for (;;) {                                              \
> -               VAL = READ_ONCE(*__PTR);                        \
>                 if (cond_expr)                                  \
>                         break;                                  \
> -               cmpwait(__PTR, VAL);                            \
> +               VAL = cmpwait(__PTR, VAL);                      \
>         }                                                       \
>         smp_acquire__after_ctrl_dep();                          \
>         VAL;                                                    \

Yes, that generates slightly better code, but now that you made me look
at it, I think we need to kill the cmpwait() in the generic version and
only keep it for arch versions.

/me ponders...

So cmpwait() as implemented here has strict semantics; but arch
implementations as previously proposed have less strict semantics; and
the use here follows that less strict variant.

The difference being that the arch implementations of cmpwait can have
false positives (i.e. return early, without a changed value);
smp_cond_load_acquire() can deal with these false positives seeing how
it's in a loop and does its own (more specific) comparison.
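
To illustrate the weaker contract: a strict cmpwait() may only return
once *ptr has changed from val, whereas a weak one may also return
spuriously. A caller that re-checks its own condition absorbs such
false positives, as in this minimal sketch of the pattern (names
borrowed from the smp_cond_load_acquire() macro above):

	for (;;) {
		VAL = READ_ONCE(*__PTR);	/* re-load after any wakeup */
		if (cond_expr)			/* the caller's real condition */
			break;
		cmpwait(__PTR, VAL);		/* weak: may return early */
	}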

Exposing cmpwait(), with the documented semantics, means that arch
versions need an additional loop inside to match these strict semantics,
or we need to weaken the cmpwait() semantics, at which point I'm not
entirely sure it's worth keeping as a generic primitive...

Hmm, so if we can find a use for the weaker cmpwait() outside of
smp_cond_load_acquire() I think we can make a case for keeping it, and
looking at qspinlock.c there's two sites we can replace cpu_relax() with
it.

Will, since ARM64 seems to want to use this, does the below make sense
to you?

---
 include/asm-generic/barrier.h | 15 ++++++---------
 kernel/locking/qspinlock.c    |  4 ++--
 2 files changed, 8 insertions(+), 11 deletions(-)


Comments

Will Deacon June 1, 2016, noon UTC | #1
On Wed, Jun 01, 2016 at 11:31:58AM +0200, Peter Zijlstra wrote:
> On Tue, May 31, 2016 at 04:01:06PM -0400, Waiman Long wrote:
> > You are doing two READ_ONCE's in the smp_cond_load_acquire loop. Can we
> > change it to do just one READ_ONCE, like
> > 
> > --- a/include/asm-generic/barrier.h
> > +++ b/include/asm-generic/barrier.h
> > @@ -229,12 +229,18 @@ do {
> >   * value; some architectures can do this in hardware.
> >   */
> >  #ifndef cmpwait
> > -#define cmpwait(ptr, val) do {                                 \
> > +#define cmpwait(ptr, val) ({                                   \
> >         typeof (ptr) __ptr = (ptr);                             \
> > -       typeof (val) __val = (val);                             \
> > -       while (READ_ONCE(*__ptr) == __val)                      \
> > +       typeof (val) __old = (val);                             \
> > +       typeof (val) __new;                                     \
> > +       for (;;) {                                              \
> > +               __new = READ_ONCE(*__ptr);                      \
> > +               if (__new != __old)                             \
> > +                       break;                                  \
> >                 cpu_relax();                                    \
> > -} while (0)
> > +       }                                                       \
> > +       __new;                                                  \
> > +})
> >  #endif
> > 
> >  /**
> > @@ -251,12 +257,11 @@ do {
> >  #ifndef smp_cond_load_acquire
> >  #define smp_cond_load_acquire(ptr, cond_expr) ({               \
> >         typeof(ptr) __PTR = (ptr);                              \
> > -       typeof(*ptr) VAL;                                       \
> > +       typeof(*ptr) VAL = READ_ONCE(*__PTR);                   \
> >         for (;;) {                                              \
> > -               VAL = READ_ONCE(*__PTR);                        \
> >                 if (cond_expr)                                  \
> >                         break;                                  \
> > -               cmpwait(__PTR, VAL);                            \
> > +               VAL = cmpwait(__PTR, VAL);                      \
> >         }                                                       \
> >         smp_acquire__after_ctrl_dep();                          \
> >         VAL;                                                    \
> 
> Yes, that generates slightly better code, but now that you made me look
> at it, I think we need to kill the cmpwait() in the generic version and
> only keep it for arch versions.
> 
> /me ponders...
> 
> So cmpwait() as implemented here has strict semantics; but arch
> implementations as previously proposed have less strict semantics; and
> the use here follows that less strict variant.
> 
> The difference being that the arch implementations of cmpwait can have
> false positives (i.e. return early, without a changed value);
> smp_cond_load_acquire() can deal with these false positives seeing how
> it's in a loop and does its own (more specific) comparison.
> 
> Exposing cmpwait(), with the documented semantics, means that arch
> versions need an additional loop inside to match these strict semantics,
> or we need to weaken the cmpwait() semantics, at which point I'm not
> entirely sure it's worth keeping as a generic primitive...
> 
> Hmm, so if we can find a use for the weaker cmpwait() outside of
> smp_cond_load_acquire() I think we can make a case for keeping it, and
> looking at qspinlock.c there's two sites we can replace cpu_relax() with
> it.
> 
> Will, since ARM64 seems to want to use this, does the below make sense
> to you?

Not especially -- I was going to override smp_cond_load_acquire anyway
because I want to build it using cmpwait_acquire and get rid of the
smp_acquire__after_ctrl_dep trick, which is likely slower on arm64.

So I'd be happier nuking cmpwait from the generic interfaces and using
smp_cond_load_acquire everywhere, if that's possible.
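
One plausible shape for such an arch override, sketched here with a
hypothetical __cmpwait_relaxed() wait primitive: the acquire ordering
comes from the load itself, so the smp_acquire__after_ctrl_dep() barrier
is not needed.

#define smp_cond_load_acquire(ptr, cond_expr)			\
({								\
	typeof(ptr) __PTR = (ptr);				\
	typeof(*ptr) VAL;					\
	for (;;) {						\
		VAL = smp_load_acquire(__PTR);			\
		if (cond_expr)					\
			break;					\
		__cmpwait_relaxed(__PTR, VAL);			\
	}							\
	VAL;							\
})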

Will
Peter Zijlstra June 1, 2016, 12:06 p.m. UTC | #2
On Wed, Jun 01, 2016 at 01:00:10PM +0100, Will Deacon wrote:
> On Wed, Jun 01, 2016 at 11:31:58AM +0200, Peter Zijlstra wrote:
> > Will, since ARM64 seems to want to use this, does the below make sense
> > to you?
> 
> Not especially -- I was going to override smp_cond_load_acquire anyway
> because I want to build it using cmpwait_acquire and get rid of the
> smp_acquire__after_ctrl_dep trick, which is likely slower on arm64.
> 
> So I'd be happier nuking cmpwait from the generic interfaces and using
> smp_cond_load_acquire everywhere, if that's possible.

Works for me; but that would lose using cmpwait() for
!smp_cond_load_acquire() spins, you fine with that?

The two conversions in the patch were both !acquire spins.
Will Deacon June 1, 2016, 12:13 p.m. UTC | #3
On Wed, Jun 01, 2016 at 02:06:54PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 01, 2016 at 01:00:10PM +0100, Will Deacon wrote:
> > On Wed, Jun 01, 2016 at 11:31:58AM +0200, Peter Zijlstra wrote:
> > > Will, since ARM64 seems to want to use this, does the below make sense
> > > to you?
> > 
> > Not especially -- I was going to override smp_cond_load_acquire anyway
> > because I want to build it using cmpwait_acquire and get rid of the
> > smp_acquire__after_ctrl_dep trick, which is likely slower on arm64.
> > 
> > So I'd be happier nuking cmpwait from the generic interfaces and using
> > smp_cond_load_acquire everywhere, if that's possible.
> 
> Works for me; but that would lose using cmpwait() for
> !smp_cond_load_acquire() spins, you fine with that?
> 
> The two conversions in the patch were both !acquire spins.

Maybe we could go the whole hog and add smp_cond_load_relaxed?
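
A plausible generic definition for such a primitive, mirroring the
generic smp_cond_load_acquire() minus the trailing acquire barrier
(sketch only):

#ifndef smp_cond_load_relaxed
#define smp_cond_load_relaxed(ptr, cond_expr) ({		\
	typeof(ptr) __PTR = (ptr);				\
	typeof(*ptr) VAL;					\
	for (;;) {						\
		VAL = READ_ONCE(*__PTR);			\
		if (cond_expr)					\
			break;					\
		cmpwait(__PTR, VAL);				\
	}							\
	VAL;							\
})
#endif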

Will
Peter Zijlstra June 1, 2016, 12:45 p.m. UTC | #4
On Wed, Jun 01, 2016 at 01:13:33PM +0100, Will Deacon wrote:
> On Wed, Jun 01, 2016 at 02:06:54PM +0200, Peter Zijlstra wrote:

> > Works for me; but that would lose using cmpwait() for
> > !smp_cond_load_acquire() spins, you fine with that?
> > 
> > The two conversions in the patch were both !acquire spins.
> 
> Maybe we could go the whole hog and add smp_cond_load_relaxed?

What about, say, the cmpxchg loops in queued_write_lock_slowpath()?
Would that be something you'd like to use wfe for?

Writing those in smp_cond_load_{acquire,relaxed}() is somewhat possible
but quite ugleh.
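
For reference, the kind of loop being discussed, adapted from
queued_write_lock_slowpath() of that era and trimmed for illustration;
the load and the cmpxchg are fused into one loop, which is why it does
not decompose cleanly into a smp_cond_load_*() plus a separate update:

	/* When no more readers remain, try to set the locked flag. */
	for (;;) {
		cnts = atomic_read(&lock->cnts);
		if ((cnts == _QW_WAITING) &&
		    (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
					    _QW_LOCKED) == _QW_WAITING))
			break;

		cpu_relax_lowlatency();
	}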

Will Deacon June 1, 2016, 2:07 p.m. UTC | #5
On Wed, Jun 01, 2016 at 02:45:41PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 01, 2016 at 01:13:33PM +0100, Will Deacon wrote:
> > On Wed, Jun 01, 2016 at 02:06:54PM +0200, Peter Zijlstra wrote:
> 
> > > Works for me; but that would lose using cmpwait() for
> > > !smp_cond_load_acquire() spins, you fine with that?
> > > 
> > > The two conversions in the patch were both !acquire spins.
> > 
> > Maybe we could go the whole hog and add smp_cond_load_relaxed?
> 
> What about, say, the cmpxchg loops in queued_write_lock_slowpath()?
> Would that be something you'd like to use wfe for?

Without actually running the code on real hardware, it's hard to say
for sure. I notice that those loops are using cpu_relax_lowlatency
at present and we *know* that we're next in the queue (i.e. we're just
waiting for existing readers to drain), so the benefit of wfe is somewhat
questionable here and I don't think we'd want to add that initially.

Will
Waiman Long June 1, 2016, 4:53 p.m. UTC | #6
On 06/01/2016 05:31 AM, Peter Zijlstra wrote:
> On Tue, May 31, 2016 at 04:01:06PM -0400, Waiman Long wrote:
>> You are doing two READ_ONCE's in the smp_cond_load_acquire loop. Can we
>> change it to do just one READ_ONCE, like
>>
>> --- a/include/asm-generic/barrier.h
>> +++ b/include/asm-generic/barrier.h
>> @@ -229,12 +229,18 @@ do {
>>    * value; some architectures can do this in hardware.
>>    */
>>   #ifndef cmpwait
>> -#define cmpwait(ptr, val) do {                                 \
>> +#define cmpwait(ptr, val) ({                                   \
>>          typeof (ptr) __ptr = (ptr);                             \
>> -       typeof (val) __val = (val);                             \
>> -       while (READ_ONCE(*__ptr) == __val)                      \
>> +       typeof (val) __old = (val);                             \
>> +       typeof (val) __new;                                     \
>> +       for (;;) {                                              \
>> +               __new = READ_ONCE(*__ptr);                      \
>> +               if (__new != __old)                             \
>> +                       break;                                  \
>>                  cpu_relax();                                    \
>> -} while (0)
>> +       }                                                       \
>> +       __new;                                                  \
>> +})
>>   #endif
>>
>>   /**
>> @@ -251,12 +257,11 @@ do {
>>   #ifndef smp_cond_load_acquire
>>   #define smp_cond_load_acquire(ptr, cond_expr) ({               \
>>          typeof(ptr) __PTR = (ptr);                              \
>> -       typeof(*ptr) VAL;                                       \
>> +       typeof(*ptr) VAL = READ_ONCE(*__PTR);                   \
>>          for (;;) {                                              \
>> -               VAL = READ_ONCE(*__PTR);                        \
>>                  if (cond_expr)                                  \
>>                          break;                                  \
>> -               cmpwait(__PTR, VAL);                            \
>> +               VAL = cmpwait(__PTR, VAL);                      \
>>          }                                                       \
>>          smp_acquire__after_ctrl_dep();                          \
>>          VAL;                                                    \
> Yes, that generates slightly better code, but now that you made me look
> at it, I think we need to kill the cmpwait() in the generic version and
> only keep it for arch versions.
>
> /me ponders...
>
> So cmpwait() as implemented here has strict semantics; but arch
> implementations as previously proposed have less strict semantics; and
> the use here follows that less strict variant.
>
> The difference being that the arch implementations of cmpwait can have
> false positives (i.e. return early, without a changed value);
> smp_cond_load_acquire() can deal with these false positives seeing how
> it's in a loop and does its own (more specific) comparison.
>
> Exposing cmpwait(), with the documented semantics, means that arch
> versions need an additional loop inside to match these strict semantics,
> or we need to weaken the cmpwait() semantics, at which point I'm not
> entirely sure it's worth keeping as a generic primitive...
>
> Hmm, so if we can find a use for the weaker cmpwait() outside of
> smp_cond_load_acquire() I think we can make a case for keeping it, and
> looking at qspinlock.c there's two sites we can replace cpu_relax() with
> it.
>
> Will, since ARM64 seems to want to use this, does the below make sense
> to you?
>
> ---
>   include/asm-generic/barrier.h | 15 ++++++---------
>   kernel/locking/qspinlock.c    |  4 ++--
>   2 files changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
> index be9222b10d17..05feda5c22e6 100644
> --- a/include/asm-generic/barrier.h
> +++ b/include/asm-generic/barrier.h
> @@ -221,20 +221,17 @@ do {									\
>   #endif
>
>   /**
> - * cmpwait - compare and wait for a variable to change
> + * cmpwait - compare and wait for a variable to 'change'
>    * @ptr: pointer to the variable to wait on
>    * @val: the value it should change from
>    *
> - * A simple constuct that waits for a variable to change from a known
> - * value; some architectures can do this in hardware.
> + * A 'better' cpu_relax(), some architectures can avoid polling and have event
> + * based wakeups on variables. Such constructs allow false positives on the
> + * 'change' and can return early. Therefore this reduces to cpu_relax()
> + * without hardware assist.
>    */
>   #ifndef cmpwait
> -#define cmpwait(ptr, val) do {					\
> -	typeof (ptr) __ptr = (ptr);				\
> -	typeof (val) __val = (val);				\
> -	while (READ_ONCE(*__ptr) == __val)			\
> -		cpu_relax();					\
> -} while (0)
> +#define cmpwait(ptr, val)	cpu_relax()
>   #endif
>
>   /**
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index e98e5bf679e9..60a811d56406 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -311,7 +311,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>   	 */
>   	if (val == _Q_PENDING_VAL) {
>   		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
> -			cpu_relax();
> +			cmpwait(&lock->val.counter, _Q_PENDING_VAL);
>   	}
>
>   	/*
> @@ -481,7 +481,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
>   	 */
>   	if (!next) {
>   		while (!(next = READ_ONCE(node->next)))
> -			cpu_relax();
> +			cmpwait(&node->next, NULL);
>   	}
>
>   	arch_mcs_spin_unlock_contended(&next->locked);

I think it is a good idea to consider cmpwait as a fancier version of 
cpu_relax(). It can certainly get used in a lot more places.

Cheers,
Longman
Peter Zijlstra June 1, 2016, 5:13 p.m. UTC | #7
On Wed, Jun 01, 2016 at 03:07:14PM +0100, Will Deacon wrote:
> On Wed, Jun 01, 2016 at 02:45:41PM +0200, Peter Zijlstra wrote:
> > On Wed, Jun 01, 2016 at 01:13:33PM +0100, Will Deacon wrote:
> > > On Wed, Jun 01, 2016 at 02:06:54PM +0200, Peter Zijlstra wrote:
> > 
> > > > Works for me; but that would lose using cmpwait() for
> > > > !smp_cond_load_acquire() spins, you fine with that?
> > > > 
> > > > The two conversions in the patch were both !acquire spins.
> > > 
> > > Maybe we could go the whole hog and add smp_cond_load_relaxed?
> > 
> > What about, say, the cmpxchg loops in queued_write_lock_slowpath()?
> > Would that be something you'd like to use wfe for?
> 
> Without actually running the code on real hardware, it's hard to say
> for sure. I notice that those loops are using cpu_relax_lowlatency
> at present and we *know* that we're next in the queue (i.e. we're just
> waiting for existing readers to drain), so the benefit of wfe is somewhat
> questionable here and I don't think we'd want to add that initially.

OK, we can always change our minds anyway. I'll respin/fold/massage
the series to make it go away.


Patch

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index be9222b10d17..05feda5c22e6 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -221,20 +221,17 @@  do {									\
 #endif
 
 /**
- * cmpwait - compare and wait for a variable to change
+ * cmpwait - compare and wait for a variable to 'change'
  * @ptr: pointer to the variable to wait on
  * @val: the value it should change from
  *
- * A simple constuct that waits for a variable to change from a known
- * value; some architectures can do this in hardware.
+ * A 'better' cpu_relax(), some architectures can avoid polling and have event
+ * based wakeups on variables. Such constructs allow false positives on the
+ * 'change' and can return early. Therefore this reduces to cpu_relax()
+ * without hardware assist.
  */
 #ifndef cmpwait
-#define cmpwait(ptr, val) do {					\
-	typeof (ptr) __ptr = (ptr);				\
-	typeof (val) __val = (val);				\
-	while (READ_ONCE(*__ptr) == __val)			\
-		cpu_relax();					\
-} while (0)
+#define cmpwait(ptr, val)	cpu_relax()
 #endif
 
 /**
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index e98e5bf679e9..60a811d56406 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -311,7 +311,7 @@  void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	 */
 	if (val == _Q_PENDING_VAL) {
 		while ((val = atomic_read(&lock->val)) == _Q_PENDING_VAL)
-			cpu_relax();
+			cmpwait(&lock->val.counter, _Q_PENDING_VAL);
 	}
 
 	/*
@@ -481,7 +481,7 @@  void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	 */
 	if (!next) {
 		while (!(next = READ_ONCE(node->next)))
-			cpu_relax();
+			cmpwait(&node->next, NULL);
 	}
 
 	arch_mcs_spin_unlock_contended(&next->locked);