Message ID | 20211110001614.2087610-2-hjl.tools@gmail.com |
---|---|
State | New |
Headers | show |
Series | Optimize CAS [BZ #28537] | expand |
On Tue, Nov 9, 2021 at 6:21 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > CAS instruction is expensive. From the x86 CPU's point of view, getting > a cache line for writing is more expensive than reading. See Appendix > A.2 Spinlock in: > > https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf > > The full compare and swap will grab the cache line exclusive and cause > excessive cache line bouncing. > > 1. Change low level locks to do an atomic load and skip CAS if compare > may fail to reduce cache line bouncing on contended locks. > 2. In __lll_lock, replace atomic_compare_and_exchange_bool_acq with > atomic_compare_and_exchange_val_acq and pass down the result to > __lll_lock_wait and __lll_lock_wait_private to avoid the redundant load > there. > 3. Drop __glibc_unlikely in __lll_trylock and lll_cond_trylock since we > don't know if it's actually rare; in the contended case it is clearly not > rare. Think especially since you have a manual check for old vs new, the atomic CAS failure should still be wrapped in a `glibc_unlikely`. > --- > nptl/lowlevellock.c | 12 ++++++------ > sysdeps/nptl/lowlevellock.h | 29 ++++++++++++++++++++--------- > 2 files changed, 26 insertions(+), 15 deletions(-) > > diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c > index 8c740529c4..d1965c01ca 100644 > --- a/nptl/lowlevellock.c > +++ b/nptl/lowlevellock.c > @@ -22,30 +22,30 @@ > #include <stap-probe.h> > > void > -__lll_lock_wait_private (int *futex) > +__lll_lock_wait_private (int *futex, int futex_value) > { > - if (atomic_load_relaxed (futex) == 2) > + if (futex_value == 2) > goto futex; > > while (atomic_exchange_acquire (futex, 2) != 0) > { > futex: > - LIBC_PROBE (lll_lock_wait_private, 1, futex); > + LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value); > futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ > } > } > libc_hidden_def (__lll_lock_wait_private) > > void > -__lll_lock_wait (int *futex, int private) > +__lll_lock_wait (int *futex, int futex_value, int private) > { > - if (atomic_load_relaxed (futex) == 2) > + if (futex_value == 2) > goto futex; > > while (atomic_exchange_acquire (futex, 2) != 0) > { > futex: > - LIBC_PROBE (lll_lock_wait, 1, futex); > + LIBC_PROBE (lll_lock_wait, 2, futex, futex_value); > futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */ > } > } > diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h > index 4d95114ed3..4235d13de9 100644 > --- a/sysdeps/nptl/lowlevellock.h > +++ b/sysdeps/nptl/lowlevellock.h > @@ -66,7 +66,11 @@ > 0. Otherwise leave lock unchanged and return non-zero to indicate that the > lock was not acquired. */ > #define __lll_trylock(lock) \ > - __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0)) > + (__extension__ ({ \ > + __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock); \ > + (__lock_value != 0 \ > + || atomic_compare_and_exchange_bool_acq ((lock), 1, 0)); \ > + })) > #define lll_trylock(lock) \ > __lll_trylock (&(lock)) > > @@ -74,11 +78,15 @@ > return 0. Otherwise leave lock unchanged and return non-zero to indicate > that the lock was not acquired. */ > #define lll_cond_trylock(lock) \ > - __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)) > + (__extension__ ({ \ > + __typeof (lock) __lock_value = atomic_load_relaxed (&(lock)); \ > + (__lock_value != 0 \ > + || atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)); \ > + })) > > -extern void __lll_lock_wait_private (int *futex); > +extern void __lll_lock_wait_private (int *futex, int futex_value); > libc_hidden_proto (__lll_lock_wait_private) > -extern void __lll_lock_wait (int *futex, int private); > +extern void __lll_lock_wait (int *futex, int futex_value, int private); > libc_hidden_proto (__lll_lock_wait) > > /* This is an expression rather than a statement even though its value is > @@ -95,13 +103,15 @@ libc_hidden_proto (__lll_lock_wait) > ((void) \ > ({ \ > int *__futex = (futex); \ > - if (__glibc_unlikely \ > - (atomic_compare_and_exchange_bool_acq (__futex, 1, 0))) \ > + int __futex_value = atomic_load_relaxed (futex); \ > + if (__futex_value != 0 \ > + || ((__futex_value = atomic_compare_and_exchange_val_acq \ > + (__futex, 1, 0) != 0))) \ > { \ > if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \ > - __lll_lock_wait_private (__futex); \ > + __lll_lock_wait_private (futex, __futex_value); \ > else \ > - __lll_lock_wait (__futex, private); \ > + __lll_lock_wait (futex, __futex_value, private); \ > } \ > })) > #define lll_lock(futex, private) \ > @@ -120,7 +130,8 @@ libc_hidden_proto (__lll_lock_wait) > ({ \ > int *__futex = (futex); \ > if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0)) \ > - __lll_lock_wait (__futex, private); \ > + __lll_lock_wait (__futex, atomic_load_relaxed (__futex), \ > + private); \ > })) > #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private) > > -- > 2.33.1 >
diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c index 8c740529c4..d1965c01ca 100644 --- a/nptl/lowlevellock.c +++ b/nptl/lowlevellock.c @@ -22,30 +22,30 @@ #include <stap-probe.h> void -__lll_lock_wait_private (int *futex) +__lll_lock_wait_private (int *futex, int futex_value) { - if (atomic_load_relaxed (futex) == 2) + if (futex_value == 2) goto futex; while (atomic_exchange_acquire (futex, 2) != 0) { futex: - LIBC_PROBE (lll_lock_wait_private, 1, futex); + LIBC_PROBE (lll_lock_wait_private, 2, futex, futex_value); futex_wait ((unsigned int *) futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ } } libc_hidden_def (__lll_lock_wait_private) void -__lll_lock_wait (int *futex, int private) +__lll_lock_wait (int *futex, int futex_value, int private) { - if (atomic_load_relaxed (futex) == 2) + if (futex_value == 2) goto futex; while (atomic_exchange_acquire (futex, 2) != 0) { futex: - LIBC_PROBE (lll_lock_wait, 1, futex); + LIBC_PROBE (lll_lock_wait, 2, futex, futex_value); futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */ } } diff --git a/sysdeps/nptl/lowlevellock.h b/sysdeps/nptl/lowlevellock.h index 4d95114ed3..4235d13de9 100644 --- a/sysdeps/nptl/lowlevellock.h +++ b/sysdeps/nptl/lowlevellock.h @@ -66,7 +66,11 @@ 0. Otherwise leave lock unchanged and return non-zero to indicate that the lock was not acquired. */ #define __lll_trylock(lock) \ - __glibc_unlikely (atomic_compare_and_exchange_bool_acq ((lock), 1, 0)) + (__extension__ ({ \ + __typeof (*(lock)) __lock_value = atomic_load_relaxed (lock); \ + (__lock_value != 0 \ + || atomic_compare_and_exchange_bool_acq ((lock), 1, 0)); \ + })) #define lll_trylock(lock) \ __lll_trylock (&(lock)) @@ -74,11 +78,15 @@ return 0. Otherwise leave lock unchanged and return non-zero to indicate that the lock was not acquired. */ #define lll_cond_trylock(lock) \ - __glibc_unlikely (atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)) + (__extension__ ({ \ + __typeof (lock) __lock_value = atomic_load_relaxed (&(lock)); \ + (__lock_value != 0 \ + || atomic_compare_and_exchange_bool_acq (&(lock), 2, 0)); \ + })) -extern void __lll_lock_wait_private (int *futex); +extern void __lll_lock_wait_private (int *futex, int futex_value); libc_hidden_proto (__lll_lock_wait_private) -extern void __lll_lock_wait (int *futex, int private); +extern void __lll_lock_wait (int *futex, int futex_value, int private); libc_hidden_proto (__lll_lock_wait) /* This is an expression rather than a statement even though its value is @@ -95,13 +103,15 @@ libc_hidden_proto (__lll_lock_wait) ((void) \ ({ \ int *__futex = (futex); \ - if (__glibc_unlikely \ - (atomic_compare_and_exchange_bool_acq (__futex, 1, 0))) \ + int __futex_value = atomic_load_relaxed (futex); \ + if (__futex_value != 0 \ + || ((__futex_value = atomic_compare_and_exchange_val_acq \ + (__futex, 1, 0) != 0))) \ { \ if (__builtin_constant_p (private) && (private) == LLL_PRIVATE) \ - __lll_lock_wait_private (__futex); \ + __lll_lock_wait_private (futex, __futex_value); \ else \ - __lll_lock_wait (__futex, private); \ + __lll_lock_wait (futex, __futex_value, private); \ } \ })) #define lll_lock(futex, private) \ @@ -120,7 +130,8 @@ libc_hidden_proto (__lll_lock_wait) ({ \ int *__futex = (futex); \ if (__glibc_unlikely (atomic_exchange_acq (__futex, 2) != 0)) \ - __lll_lock_wait (__futex, private); \ + __lll_lock_wait (__futex, atomic_load_relaxed (__futex), \ + private); \ })) #define lll_cond_lock(futex, private) __lll_cond_lock (&(futex), private)