[RFC] Enable RTM lock elision in current rwlock

Submitted by Andrew Senkevich on April 5, 2017, 9:23 a.m.

Details

Message ID CAMXFM3tui-xJ1pGWuK_YJS3XOR+KV4PL7NeKpgPJ_KC9pRARxw@mail.gmail.com
State New
Headers show

Commit Message

Andrew Senkevich April 5, 2017, 9:23 a.m.
Hi,

here is the patch to enable RTM lock elision in current rwlock implementation.

Patch adds enabling itself using existing wrappers macro with new
conditions and fixes typo in elision tuning.

ChangeLog:

        * nptl/pthread_rwlock_rdlock.c: Add RTM lock elision.
        * nptl/pthread_rwlock_tryrdlock.c: Likewise.
        * nptl/pthread_rwlock_trywrlock.c: Likewise.
        * nptl/pthread_rwlock_wrlock.c: Likewise.
        * nptl/pthread_rwlock_unlock.c: Add RTM unlock elision.
        * sysdeps/x86/elide.h (ELIDE_LOCK): Fixed typo.


Performance was measured with bench-multithread.c posted at
https://sourceware.org/ml/libc-alpha/2017-01/msg00168.html on i7-6700
(Skylake).

With R=60 and shared W=0 lock elision helps to increase scalability
with some slowdown in single-thread case:
w/o patch:
threads=1  iter/s=        4060317  timing_t=17040157458
threads=2  iter/s=        6752786  timing_t=17040102928
threads=3  iter/s=        9392445  timing_t=17040102070
threads=4  iter/s=       10463599  timing_t=17040103938

with patch:
threads=1  iter/s=        3402230  timing_t=17040139858
threads=2  iter/s=        6784424  timing_t=17040102620
threads=3  iter/s=       10157296  timing_t=17040103664
threads=4  iter/s=       13509321  timing_t=17040108804

But with R=60 and shared W=1 we have not so good picture:
w/o patch:
threads=1  iter/s=        3939640  timing_t=17040142322
threads=2  iter/s=        6018330  timing_t=17040101032
threads=3  iter/s=        7598935  timing_t=17040101858
threads=4  iter/s=        7873063  timing_t=17040107686

with patch:
threads=1  iter/s=        3317374  timing_t=17040140564
threads=2  iter/s=        4555148  timing_t=17040106910
threads=3  iter/s=        6583628  timing_t=17040105330
threads=4  iter/s=        7421028  timing_t=17040101884
(perf stat -T tells we have 9.44% transactional cycles and 3.84% aborted cycles)

I have tried to tune lock elision and it change performance but
haven't achieve results better or equal than w/o patch:

with patch and tuning:
threads=1  iter/s=        3324805  timing_t=17040143918
threads=2  iter/s=        5360456  timing_t=17040100294
threads=3  iter/s=        6997841  timing_t=17040107640
threads=4  iter/s=        7406136  timing_t=17040104018
(perf stat -T tells we have 5.84% transactional cycles and 0.46% aborted cycles)

Is condition for wrapper macro enough correct?
Maybe it have sence to make tuning configurable by tunables and have
additional switch for turn it on/off?


--
WBR,
Andrew

Comments

Andrew Senkevich June 13, 2017, 10:50 a.m.
2017-04-05 11:23 GMT+02:00 Andrew Senkevich <andrew.n.senkevich@gmail.com>:
> Hi,
>
> here is the patch to enable RTM lock elision in current rwlock implementation.
>
> Patch adds enabling itself using existing wrappers macro with new
> conditions and fixes typo in elision tuning.
>
> ChangeLog:
>
>         * nptl/pthread_rwlock_rdlock.c: Add RTM lock elision.
>         * nptl/pthread_rwlock_tryrdlock.c: Likewise.
>         * nptl/pthread_rwlock_trywrlock.c: Likewise.
>         * nptl/pthread_rwlock_wrlock.c: Likewise.
>         * nptl/pthread_rwlock_unlock.c: Add RTM unlock elision.
>         * sysdeps/x86/elide.h (ELIDE_LOCK): Fixed typo.
>
> diff --git a/nptl/pthread_rwlock_rdlock.c b/nptl/pthread_rwlock_rdlock.c
> index e07581b..3073dfc 100644
> --- a/nptl/pthread_rwlock_rdlock.c
> +++ b/nptl/pthread_rwlock_rdlock.c
> @@ -16,6 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>
> +#include <elide.h>
> +
>  #include "pthread_rwlock_common.c"
>
>  /* See pthread_rwlock_common.c.  */
> @@ -24,6 +26,12 @@ __pthread_rwlock_rdlock (pthread_rwlock_t *rwlock)
>  {
>    LIBC_PROBE (rdlock_entry, 1, rwlock);
>
> +  if (ELIDE_LOCK (rwlock->__data.__rwelision,
> +  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +  && rwlock->__data.__writers_futex == 0))
> +    return 0;
> +
>    int result = __pthread_rwlock_rdlock_full (rwlock, NULL);
>    LIBC_PROBE (rdlock_acquire_read, 1, rwlock);
>    return result;
> diff --git a/nptl/pthread_rwlock_tryrdlock.c b/nptl/pthread_rwlock_tryrdlock.c
> index 6c3014c..7cff149 100644
> --- a/nptl/pthread_rwlock_tryrdlock.c
> +++ b/nptl/pthread_rwlock_tryrdlock.c
> @@ -20,6 +20,8 @@
>  #include "pthreadP.h"
>  #include <atomic.h>
>  #include <stdbool.h>
> +#include <elide.h>
> +
>  #include "pthread_rwlock_common.c"
>
>
> @@ -27,6 +29,12 @@
>  int
>  __pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock)
>  {
> +  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
> +     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +     && rwlock->__data.__writers_futex, 0))
> +    return 0;
> +
>    /* For tryrdlock, we could speculate that we will succeed and go ahead and
>       register as a reader.  However, if we misspeculate, we have to do the
>       same steps as a timed-out rdlock, which will increase contention.
> diff --git a/nptl/pthread_rwlock_trywrlock.c b/nptl/pthread_rwlock_trywrlock.c
> index 0d9ccaf..dce32da 100644
> --- a/nptl/pthread_rwlock_trywrlock.c
> +++ b/nptl/pthread_rwlock_trywrlock.c
> @@ -19,11 +19,18 @@
>  #include <errno.h>
>  #include "pthreadP.h"
>  #include <atomic.h>
> +#include <elide.h>
>
>  /* See pthread_rwlock_common.c for an overview.  */
>  int
>  __pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock)
>  {
> +  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
> +     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +     && rwlock->__data.__writers_futex, 1))
> +    return 0;
> +
>    /* When in a trywrlock, we can acquire the write lock if it is in states
>       #1 (idle and read phase) and #5 (idle and write phase), and also in #6
>       (readers waiting, write phase) if we prefer writers.
> diff --git a/nptl/pthread_rwlock_unlock.c b/nptl/pthread_rwlock_unlock.c
> index ef46e88..fa4555d 100644
> --- a/nptl/pthread_rwlock_unlock.c
> +++ b/nptl/pthread_rwlock_unlock.c
> @@ -22,6 +22,7 @@
>  #include <pthread.h>
>  #include <pthreadP.h>
>  #include <stap-probe.h>
> +#include <elide.h>
>
>  #include "pthread_rwlock_common.c"
>
> @@ -31,6 +32,10 @@ __pthread_rwlock_unlock (pthread_rwlock_t *rwlock)
>  {
>    LIBC_PROBE (rwlock_unlock, 1, rwlock);
>
> +  if (ELIDE_UNLOCK ((rwlock->__data.__readers >>
> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +    && rwlock->__data.__writers_futex == 0))
> +    return 0;
> +
>    /* We distinguish between having acquired a read vs. a write lock by looking
>       at the writer TID.  If it's equal to our TID, we must be the writer
>       because nobody else can have stored this value.  Also, if we are a
> diff --git a/nptl/pthread_rwlock_wrlock.c b/nptl/pthread_rwlock_wrlock.c
> index 335fcd1..d041aad 100644
> --- a/nptl/pthread_rwlock_wrlock.c
> +++ b/nptl/pthread_rwlock_wrlock.c
> @@ -16,6 +16,8 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>
> +#include <elide.h>
> +
>  #include "pthread_rwlock_common.c"
>
>  /* See pthread_rwlock_common.c.  */
> @@ -24,6 +26,12 @@ __pthread_rwlock_wrlock (pthread_rwlock_t *rwlock)
>  {
>    LIBC_PROBE (wrlock_entry, 1, rwlock);
>
> +  if (ELIDE_LOCK (rwlock->__data.__rwelision,
> +  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
> (rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
> +  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
> +  && rwlock->__data.__writers_futex == 0))
> +    return 0;
> +
>    int result = __pthread_rwlock_wrlock_full (rwlock, NULL);
>    LIBC_PROBE (wrlock_acquire_write, 1, rwlock);
>    return result;
> diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
> index 53de418..3eaf15c 100644
> --- a/sysdeps/x86/elide.h
> +++ b/sysdeps/x86/elide.h
> @@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
>            } \
>          _xabort (_ABORT_LOCK_BUSY); \
>        } \
> -    if (!elision_adapt (&(adapt_count), status)) \
> +    if (elision_adapt (&(adapt_count), status)) \
>        break; \
>            } \
>        } \
>
> Performance was measured with bench-multithread.c posted at
> https://sourceware.org/ml/libc-alpha/2017-01/msg00168.html on i7-6700
> (Skylake).
>
> With R=60 and shared W=0 lock elision helps to increase scalability
> with some slowdown in single-thread case:
> w/o patch:
> threads=1  iter/s=        4060317  timing_t=17040157458
> threads=2  iter/s=        6752786  timing_t=17040102928
> threads=3  iter/s=        9392445  timing_t=17040102070
> threads=4  iter/s=       10463599  timing_t=17040103938
>
> with patch:
> threads=1  iter/s=        3402230  timing_t=17040139858
> threads=2  iter/s=        6784424  timing_t=17040102620
> threads=3  iter/s=       10157296  timing_t=17040103664
> threads=4  iter/s=       13509321  timing_t=17040108804
>
> But with R=60 and shared W=1 we have not so good picture:
> w/o patch:
> threads=1  iter/s=        3939640  timing_t=17040142322
> threads=2  iter/s=        6018330  timing_t=17040101032
> threads=3  iter/s=        7598935  timing_t=17040101858
> threads=4  iter/s=        7873063  timing_t=17040107686
>
> with patch:
> threads=1  iter/s=        3317374  timing_t=17040140564
> threads=2  iter/s=        4555148  timing_t=17040106910
> threads=3  iter/s=        6583628  timing_t=17040105330
> threads=4  iter/s=        7421028  timing_t=17040101884
> (perf stat -T tells we have 9.44% transactional cycles and 3.84% aborted cycles)
>
> I have tried to tune lock elision and it change performance but
> haven't achieve results better or equal than w/o patch:
>
> with patch and tuning:
> threads=1  iter/s=        3324805  timing_t=17040143918
> threads=2  iter/s=        5360456  timing_t=17040100294
> threads=3  iter/s=        6997841  timing_t=17040107640
> threads=4  iter/s=        7406136  timing_t=17040104018
> (perf stat -T tells we have 5.84% transactional cycles and 0.46% aborted cycles)
>
> Is condition for wrapper macro enough correct?
> Maybe it have sence to make tuning configurable by tunables and have
> additional switch for turn it on/off?

Ping.

Just a note that here are two parts, one part is only typo fix:

>         * sysdeps/x86/elide.h (ELIDE_LOCK): Fix typo.
>
> diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
> index 53de418..3eaf15c 100644
> --- a/sysdeps/x86/elide.h
> +++ b/sysdeps/x86/elide.h
> @@ -77,7 +77,7 @@ elision_adapt(signed char *adapt_count, unsigned int status)
>            } \
>          _xabort (_ABORT_LOCK_BUSY); \
>        } \
> -    if (!elision_adapt (&(adapt_count), status)) \
> +    if (elision_adapt (&(adapt_count), status)) \
>        break; \
>            } \
>        } \


--
WBR,
Andrew

Patch hide | download patch | download mbox

diff --git a/nptl/pthread_rwlock_rdlock.c b/nptl/pthread_rwlock_rdlock.c
index e07581b..3073dfc 100644
--- a/nptl/pthread_rwlock_rdlock.c
+++ b/nptl/pthread_rwlock_rdlock.c
@@ -16,6 +16,8 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */

+#include <elide.h>
+
 #include "pthread_rwlock_common.c"

 /* See pthread_rwlock_common.c.  */
@@ -24,6 +26,12 @@  __pthread_rwlock_rdlock (pthread_rwlock_t *rwlock)
 {
   LIBC_PROBE (rdlock_entry, 1, rwlock);

+  if (ELIDE_LOCK (rwlock->__data.__rwelision,
+  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+  && rwlock->__data.__writers_futex == 0))
+    return 0;
+
   int result = __pthread_rwlock_rdlock_full (rwlock, NULL);
   LIBC_PROBE (rdlock_acquire_read, 1, rwlock);
   return result;
diff --git a/nptl/pthread_rwlock_tryrdlock.c b/nptl/pthread_rwlock_tryrdlock.c
index 6c3014c..7cff149 100644
--- a/nptl/pthread_rwlock_tryrdlock.c
+++ b/nptl/pthread_rwlock_tryrdlock.c
@@ -20,6 +20,8 @@ 
 #include "pthreadP.h"
 #include <atomic.h>
 #include <stdbool.h>
+#include <elide.h>
+
 #include "pthread_rwlock_common.c"


@@ -27,6 +29,12 @@ 
 int
 __pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock)
 {
+  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
+     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+     && rwlock->__data.__writers_futex, 0))
+    return 0;
+
   /* For tryrdlock, we could speculate that we will succeed and go ahead and
      register as a reader.  However, if we misspeculate, we have to do the
      same steps as a timed-out rdlock, which will increase contention.
diff --git a/nptl/pthread_rwlock_trywrlock.c b/nptl/pthread_rwlock_trywrlock.c
index 0d9ccaf..dce32da 100644
--- a/nptl/pthread_rwlock_trywrlock.c
+++ b/nptl/pthread_rwlock_trywrlock.c
@@ -19,11 +19,18 @@ 
 #include <errno.h>
 #include "pthreadP.h"
 #include <atomic.h>
+#include <elide.h>

 /* See pthread_rwlock_common.c for an overview.  */
 int
 __pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock)
 {
+  if (ELIDE_TRYLOCK (rwlock->__data.__rwelision,
+     ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+     && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+     && rwlock->__data.__writers_futex, 1))
+    return 0;
+
   /* When in a trywrlock, we can acquire the write lock if it is in states
      #1 (idle and read phase) and #5 (idle and write phase), and also in #6
      (readers waiting, write phase) if we prefer writers.
diff --git a/nptl/pthread_rwlock_unlock.c b/nptl/pthread_rwlock_unlock.c
index ef46e88..fa4555d 100644
--- a/nptl/pthread_rwlock_unlock.c
+++ b/nptl/pthread_rwlock_unlock.c
@@ -22,6 +22,7 @@ 
 #include <pthread.h>
 #include <pthreadP.h>
 #include <stap-probe.h>
+#include <elide.h>

 #include "pthread_rwlock_common.c"

@@ -31,6 +32,10 @@  __pthread_rwlock_unlock (pthread_rwlock_t *rwlock)
 {
   LIBC_PROBE (rwlock_unlock, 1, rwlock);

+  if (ELIDE_UNLOCK ((rwlock->__data.__readers >>
PTHREAD_RWLOCK_READER_SHIFT) == 0
+    && rwlock->__data.__writers_futex == 0))
+    return 0;
+
   /* We distinguish between having acquired a read vs. a write lock by looking
      at the writer TID.  If it's equal to our TID, we must be the writer
      because nobody else can have stored this value.  Also, if we are a
diff --git a/nptl/pthread_rwlock_wrlock.c b/nptl/pthread_rwlock_wrlock.c
index 335fcd1..d041aad 100644
--- a/nptl/pthread_rwlock_wrlock.c
+++ b/nptl/pthread_rwlock_wrlock.c
@@ -16,6 +16,8 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */

+#include <elide.h>
+
 #include "pthread_rwlock_common.c"

 /* See pthread_rwlock_common.c.  */
@@ -24,6 +26,12 @@  __pthread_rwlock_wrlock (pthread_rwlock_t *rwlock)
 {
   LIBC_PROBE (wrlock_entry, 1, rwlock);

+  if (ELIDE_LOCK (rwlock->__data.__rwelision,
+  ((rwlock->__data.__readers & PTHREAD_RWLOCK_WRPHASE) &&
(rwlock->__data.__readers & PTHREAD_RWLOCK_WRLOCKED)) == 0
+  && (rwlock->__data.__readers >> PTHREAD_RWLOCK_READER_SHIFT) == 0
+  && rwlock->__data.__writers_futex == 0))
+    return 0;
+
   int result = __pthread_rwlock_wrlock_full (rwlock, NULL);
   LIBC_PROBE (wrlock_acquire_write, 1, rwlock);
   return result;
diff --git a/sysdeps/x86/elide.h b/sysdeps/x86/elide.h
index 53de418..3eaf15c 100644
--- a/sysdeps/x86/elide.h
+++ b/sysdeps/x86/elide.h
@@ -77,7 +77,7 @@  elision_adapt(signed char *adapt_count, unsigned int status)
           } \
         _xabort (_ABORT_LOCK_BUSY); \
       } \
-    if (!elision_adapt (&(adapt_count), status)) \
+    if (elision_adapt (&(adapt_count), status)) \
       break; \
           } \
       } \