
[v2,3/4] Remove usage of TLS_MULTIPLE_THREADS_IN_TCB

Message ID 20220610163552.3587064-4-adhemerval.zanella@linaro.org
State New
Series Simplify internal single-threaded usage

Commit Message

Adhemerval Zanella Netto June 10, 2022, 4:35 p.m. UTC
Instead use __libc_single_threaded on all architectures.  The TCB
field is renamed to avoid changing the struct layout.

The x86 atomics need some adjustments since they have the single-thread
optimization built into the inline assembly.  They now use
SINGLE_THREAD_P, and the atomic optimizations that are no longer used
are removed.

Checked on x86_64-linux-gnu and i686-linux-gnu.
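
To illustrate the direction of the change, here is a minimal sketch (not
the glibc-internal code) of a conditionally locked increment in the spirit
of the reworked x86 catomic_* macros; __libc_single_threaded is the public
flag from <sys/single_threaded.h>, while the name my_catomic_increment is
made up for this example and the asm is x86-only:

#include <sys/single_threaded.h>

/* Skip the lock prefix when the process is known to be single-threaded,
   otherwise fall back to a fully atomic read-modify-write.  */
static inline void
my_catomic_increment (int *mem)
{
  if (__libc_single_threaded)
    __asm__ __volatile__ ("incl %0" : "+m" (*mem) : : "memory", "cc");
  else
    __asm__ __volatile__ ("lock; incl %0" : "+m" (*mem) : : "memory", "cc");
}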
---
 misc/tst-atomic.c                       |   1 +
 nptl/allocatestack.c                    |   6 -
 nptl/descr.h                            |  17 +-
 nptl/pthread_cancel.c                   |   7 +-
 nptl/pthread_create.c                   |   5 -
 sysdeps/i386/htl/tcb-offsets.sym        |   1 -
 sysdeps/i386/nptl/tcb-offsets.sym       |   1 -
 sysdeps/i386/nptl/tls.h                 |   4 +-
 sysdeps/ia64/nptl/tcb-offsets.sym       |   1 -
 sysdeps/ia64/nptl/tls.h                 |   2 -
 sysdeps/mach/hurd/i386/tls.h            |   4 +-
 sysdeps/nios2/nptl/tcb-offsets.sym      |   1 -
 sysdeps/or1k/nptl/tls.h                 |   2 -
 sysdeps/powerpc/nptl/tcb-offsets.sym    |   3 -
 sysdeps/powerpc/nptl/tls.h              |   3 -
 sysdeps/s390/nptl/tcb-offsets.sym       |   1 -
 sysdeps/s390/nptl/tls.h                 |   6 +-
 sysdeps/sh/nptl/tcb-offsets.sym         |   1 -
 sysdeps/sh/nptl/tls.h                   |   2 -
 sysdeps/sparc/nptl/tcb-offsets.sym      |   1 -
 sysdeps/sparc/nptl/tls.h                |   2 +-
 sysdeps/unix/sysv/linux/single-thread.h |  15 +-
 sysdeps/x86/atomic-machine.h            | 484 +++++++-----------------
 sysdeps/x86_64/nptl/tcb-offsets.sym     |   1 -
 24 files changed, 145 insertions(+), 426 deletions(-)

Comments

H.J. Lu June 10, 2022, 7:49 p.m. UTC | #1
On Fri, Jun 10, 2022 at 9:40 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Instead use __libc_single_threaded on all architectures.  The TCB
> field is renamed to avoid changing the struct layout.
>
> The x86 atomics need some adjustments since they have the single-thread
> optimization built into the inline assembly.  They now use
> SINGLE_THREAD_P, and the atomic optimizations that are no longer used
> are removed.
>
> Checked on x86_64-linux-gnu and i686-linux-gnu.
> ---
>  misc/tst-atomic.c                       |   1 +
>  nptl/allocatestack.c                    |   6 -
>  nptl/descr.h                            |  17 +-
>  nptl/pthread_cancel.c                   |   7 +-
>  nptl/pthread_create.c                   |   5 -
>  sysdeps/i386/htl/tcb-offsets.sym        |   1 -
>  sysdeps/i386/nptl/tcb-offsets.sym       |   1 -
>  sysdeps/i386/nptl/tls.h                 |   4 +-
>  sysdeps/ia64/nptl/tcb-offsets.sym       |   1 -
>  sysdeps/ia64/nptl/tls.h                 |   2 -
>  sysdeps/mach/hurd/i386/tls.h            |   4 +-
>  sysdeps/nios2/nptl/tcb-offsets.sym      |   1 -
>  sysdeps/or1k/nptl/tls.h                 |   2 -
>  sysdeps/powerpc/nptl/tcb-offsets.sym    |   3 -
>  sysdeps/powerpc/nptl/tls.h              |   3 -
>  sysdeps/s390/nptl/tcb-offsets.sym       |   1 -
>  sysdeps/s390/nptl/tls.h                 |   6 +-
>  sysdeps/sh/nptl/tcb-offsets.sym         |   1 -
>  sysdeps/sh/nptl/tls.h                   |   2 -
>  sysdeps/sparc/nptl/tcb-offsets.sym      |   1 -
>  sysdeps/sparc/nptl/tls.h                |   2 +-
>  sysdeps/unix/sysv/linux/single-thread.h |  15 +-
>  sysdeps/x86/atomic-machine.h            | 484 +++++++-----------------
>  sysdeps/x86_64/nptl/tcb-offsets.sym     |   1 -
>  24 files changed, 145 insertions(+), 426 deletions(-)
>
> diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
> index 6d681a7bfd..ddbc618e25 100644
> --- a/misc/tst-atomic.c
> +++ b/misc/tst-atomic.c
> @@ -18,6 +18,7 @@
>
>  #include <stdio.h>
>  #include <atomic.h>
> +#include <support/xthread.h>
>
>  #ifndef atomic_t
>  # define atomic_t int
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 98f5f6dd85..3e0d01cb52 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -290,9 +290,6 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>          stack cache nor will the memory (except the TLS memory) be freed.  */
>        pd->user_stack = true;
>
> -      /* This is at least the second thread.  */
> -      pd->header.multiple_threads = 1;
> -
>  #ifdef NEED_DL_SYSINFO
>        SETUP_THREAD_SYSINFO (pd);
>  #endif
> @@ -408,9 +405,6 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>              descriptor.  */
>           pd->specific[0] = pd->specific_1stblock;
>
> -         /* This is at least the second thread.  */
> -         pd->header.multiple_threads = 1;
> -
>  #ifdef NEED_DL_SYSINFO
>           SETUP_THREAD_SYSINFO (pd);
>  #endif
> diff --git a/nptl/descr.h b/nptl/descr.h
> index bb46b5958e..77b25d8267 100644
> --- a/nptl/descr.h
> +++ b/nptl/descr.h
> @@ -137,22 +137,7 @@ struct pthread
>  #else
>      struct
>      {
> -      /* multiple_threads is enabled either when the process has spawned at
> -        least one thread or when a single-threaded process cancels itself.
> -        This enables additional code to introduce locking before doing some
> -        compare_and_exchange operations and also enable cancellation points.
> -        The concepts of multiple threads and cancellation points ideally
> -        should be separate, since it is not necessary for multiple threads to
> -        have been created for cancellation points to be enabled, as is the
> -        case is when single-threaded process cancels itself.
> -
> -        Since enabling multiple_threads enables additional code in
> -        cancellation points and compare_and_exchange operations, there is a
> -        potential for an unneeded performance hit when it is enabled in a
> -        single-threaded, self-canceling process.  This is OK though, since a
> -        single-threaded process will enable async cancellation only when it
> -        looks to cancel itself and is hence going to end anyway.  */
> -      int multiple_threads;
> +      int unused_multiple_threads;
>        int gscope_flag;
>      } header;
>  #endif
> diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
> index e1735279f2..6d26a15d0e 100644
> --- a/nptl/pthread_cancel.c
> +++ b/nptl/pthread_cancel.c
> @@ -157,12 +157,9 @@ __pthread_cancel (pthread_t th)
>
>         /* A single-threaded process should be able to kill itself, since
>            there is nothing in the POSIX specification that says that it
> -          cannot.  So we set multiple_threads to true so that cancellation
> -          points get executed.  */
> -       THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
> -#ifndef TLS_MULTIPLE_THREADS_IN_TCB
> +          cannot.  So we set __libc_single_threaded to true so that
> +          cancellation points get executed.  */
>         __libc_single_threaded = 0;
> -#endif
>      }
>    while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval,
>                                                 newval));
> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 5633d01c62..d43865352f 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -882,11 +882,6 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>            other reason that create_thread chose.  Now let it run
>            free.  */
>         lll_unlock (pd->lock, LLL_PRIVATE);
> -
> -      /* We now have for sure more than one thread.  The main thread might
> -        not yet have the flag set.  No need to set the global variable
> -        again if this is what we use.  */
> -      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
>      }
>
>   out:
> diff --git a/sysdeps/i386/htl/tcb-offsets.sym b/sysdeps/i386/htl/tcb-offsets.sym
> index 7b7c719369..f3f7df6c06 100644
> --- a/sysdeps/i386/htl/tcb-offsets.sym
> +++ b/sysdeps/i386/htl/tcb-offsets.sym
> @@ -2,7 +2,6 @@
>  #include <tls.h>
>  #include <kernel-features.h>
>
> -MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads)
>  SYSINFO_OFFSET          offsetof (tcbhead_t, sysinfo)
>  POINTER_GUARD           offsetof (tcbhead_t, pointer_guard)
>  SIGSTATE_OFFSET         offsetof (tcbhead_t, _hurd_sigstate)
> diff --git a/sysdeps/i386/nptl/tcb-offsets.sym b/sysdeps/i386/nptl/tcb-offsets.sym
> index 2ec9e787c1..1efd1469d8 100644
> --- a/sysdeps/i386/nptl/tcb-offsets.sym
> +++ b/sysdeps/i386/nptl/tcb-offsets.sym
> @@ -6,7 +6,6 @@ RESULT                  offsetof (struct pthread, result)
>  TID                    offsetof (struct pthread, tid)
>  CANCELHANDLING         offsetof (struct pthread, cancelhandling)
>  CLEANUP_JMP_BUF                offsetof (struct pthread, cleanup_jmp_buf)
> -MULTIPLE_THREADS_OFFSET        offsetof (tcbhead_t, multiple_threads)
>  SYSINFO_OFFSET         offsetof (tcbhead_t, sysinfo)
>  CLEANUP                        offsetof (struct pthread, cleanup)
>  CLEANUP_PREV           offsetof (struct _pthread_cleanup_buffer, __prev)
> diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
> index 91090bf287..48940a9f44 100644
> --- a/sysdeps/i386/nptl/tls.h
> +++ b/sysdeps/i386/nptl/tls.h
> @@ -36,7 +36,7 @@ typedef struct
>                            thread descriptor used by libpthread.  */
>    dtv_t *dtv;
>    void *self;          /* Pointer to the thread descriptor.  */
> -  int multiple_threads;
> +  int unused_multiple_threads;
>    uintptr_t sysinfo;
>    uintptr_t stack_guard;
>    uintptr_t pointer_guard;
> @@ -57,8 +57,6 @@ typedef struct
>  _Static_assert (offsetof (tcbhead_t, __private_ss) == 0x30,
>                 "offset of __private_ss != 0x30");
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif
> diff --git a/sysdeps/ia64/nptl/tcb-offsets.sym b/sysdeps/ia64/nptl/tcb-offsets.sym
> index b01f712be2..ab2cb180f9 100644
> --- a/sysdeps/ia64/nptl/tcb-offsets.sym
> +++ b/sysdeps/ia64/nptl/tcb-offsets.sym
> @@ -2,5 +2,4 @@
>  #include <tls.h>
>
>  TID                    offsetof (struct pthread, tid) - TLS_PRE_TCB_SIZE
> -MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads) - TLS_PRE_TCB_SIZE
>  SYSINFO_OFFSET         offsetof (tcbhead_t, __private)
> diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
> index 8ccedb73e6..008e080fc4 100644
> --- a/sysdeps/ia64/nptl/tls.h
> +++ b/sysdeps/ia64/nptl/tls.h
> @@ -36,8 +36,6 @@ typedef struct
>
>  register struct pthread *__thread_self __asm__("r13");
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif
> diff --git a/sysdeps/mach/hurd/i386/tls.h b/sysdeps/mach/hurd/i386/tls.h
> index 264ed9a9c5..d33e91c922 100644
> --- a/sysdeps/mach/hurd/i386/tls.h
> +++ b/sysdeps/mach/hurd/i386/tls.h
> @@ -33,7 +33,7 @@ typedef struct
>    void *tcb;                   /* Points to this structure.  */
>    dtv_t *dtv;                  /* Vector of pointers to TLS data.  */
>    thread_t self;               /* This thread's control port.  */
> -  int multiple_threads;
> +  int unused_multiple_threads;
>    uintptr_t sysinfo;
>    uintptr_t stack_guard;
>    uintptr_t pointer_guard;
> @@ -117,8 +117,6 @@ _hurd_tls_init (tcbhead_t *tcb)
>    /* This field is used by TLS accesses to get our "thread pointer"
>       from the TLS point of view.  */
>    tcb->tcb = tcb;
> -  /* We always at least start the sigthread anyway.  */
> -  tcb->multiple_threads = 1;
>
>    /* Get the first available selector.  */
>    int sel = -1;
> diff --git a/sysdeps/nios2/nptl/tcb-offsets.sym b/sysdeps/nios2/nptl/tcb-offsets.sym
> index 3cd8d984ac..93a695ac7f 100644
> --- a/sysdeps/nios2/nptl/tcb-offsets.sym
> +++ b/sysdeps/nios2/nptl/tcb-offsets.sym
> @@ -8,6 +8,5 @@
>  # define __thread_self          ((void *) 0)
>  # define thread_offsetof(mem)   ((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
>
> -MULTIPLE_THREADS_OFFSET                thread_offsetof (header.multiple_threads)
>  TID_OFFSET                     thread_offsetof (tid)
>  POINTER_GUARD                  (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
> diff --git a/sysdeps/or1k/nptl/tls.h b/sysdeps/or1k/nptl/tls.h
> index c6ffe62c3f..3bb07beef8 100644
> --- a/sysdeps/or1k/nptl/tls.h
> +++ b/sysdeps/or1k/nptl/tls.h
> @@ -35,8 +35,6 @@ typedef struct
>
>  register tcbhead_t *__thread_self __asm__("r10");
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  /* Get system call information.  */
>  # include <sysdep.h>
>
> diff --git a/sysdeps/powerpc/nptl/tcb-offsets.sym b/sysdeps/powerpc/nptl/tcb-offsets.sym
> index 4c01615ad0..a0ee95f94d 100644
> --- a/sysdeps/powerpc/nptl/tcb-offsets.sym
> +++ b/sysdeps/powerpc/nptl/tcb-offsets.sym
> @@ -10,9 +10,6 @@
>  # define thread_offsetof(mem)  ((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
>
>
> -#if TLS_MULTIPLE_THREADS_IN_TCB
> -MULTIPLE_THREADS_OFFSET                thread_offsetof (header.multiple_threads)
> -#endif
>  TID                            thread_offsetof (tid)
>  POINTER_GUARD                  (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
>  TAR_SAVE                       (offsetof (tcbhead_t, tar_save) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
> diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
> index 22b0075235..fd5ee51981 100644
> --- a/sysdeps/powerpc/nptl/tls.h
> +++ b/sysdeps/powerpc/nptl/tls.h
> @@ -52,9 +52,6 @@
>  # define TLS_DTV_AT_TP 1
>  # define TLS_TCB_AT_TP 0
>
> -/* We use the multiple_threads field in the pthread struct */
> -#define TLS_MULTIPLE_THREADS_IN_TCB    1
> -
>  /* Get the thread descriptor definition.  */
>  # include <nptl/descr.h>
>
> diff --git a/sysdeps/s390/nptl/tcb-offsets.sym b/sysdeps/s390/nptl/tcb-offsets.sym
> index 9c1c01f353..bc7b267463 100644
> --- a/sysdeps/s390/nptl/tcb-offsets.sym
> +++ b/sysdeps/s390/nptl/tcb-offsets.sym
> @@ -1,6 +1,5 @@
>  #include <sysdep.h>
>  #include <tls.h>
>
> -MULTIPLE_THREADS_OFFSET                offsetof (tcbhead_t, multiple_threads)
>  STACK_GUARD                    offsetof (tcbhead_t, stack_guard)
>  TID                            offsetof (struct pthread, tid)
> diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
> index ff210ffeb2..d69ed539f7 100644
> --- a/sysdeps/s390/nptl/tls.h
> +++ b/sysdeps/s390/nptl/tls.h
> @@ -35,7 +35,7 @@ typedef struct
>                            thread descriptor used by libpthread.  */
>    dtv_t *dtv;
>    void *self;          /* Pointer to the thread descriptor.  */
> -  int multiple_threads;
> +  int unused_multiple_threads;
>    uintptr_t sysinfo;
>    uintptr_t stack_guard;
>    int gscope_flag;
> @@ -44,10 +44,6 @@ typedef struct
>    void *__private_ss;
>  } tcbhead_t;
>
> -# ifndef __s390x__
> -#  define TLS_MULTIPLE_THREADS_IN_TCB 1
> -# endif
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif
> diff --git a/sysdeps/sh/nptl/tcb-offsets.sym b/sysdeps/sh/nptl/tcb-offsets.sym
> index 234207779d..4e452d9c6c 100644
> --- a/sysdeps/sh/nptl/tcb-offsets.sym
> +++ b/sysdeps/sh/nptl/tcb-offsets.sym
> @@ -6,7 +6,6 @@ RESULT                  offsetof (struct pthread, result)
>  TID                    offsetof (struct pthread, tid)
>  CANCELHANDLING         offsetof (struct pthread, cancelhandling)
>  CLEANUP_JMP_BUF                offsetof (struct pthread, cleanup_jmp_buf)
> -MULTIPLE_THREADS_OFFSET        offsetof (struct pthread, header.multiple_threads)
>  TLS_PRE_TCB_SIZE       sizeof (struct pthread)
>  MUTEX_FUTEX            offsetof (pthread_mutex_t, __data.__lock)
>  POINTER_GUARD          offsetof (tcbhead_t, pointer_guard)
> diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
> index 76591ab6ef..8778cb4ac0 100644
> --- a/sysdeps/sh/nptl/tls.h
> +++ b/sysdeps/sh/nptl/tls.h
> @@ -36,8 +36,6 @@ typedef struct
>    uintptr_t pointer_guard;
>  } tcbhead_t;
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif /* __ASSEMBLER__ */
> diff --git a/sysdeps/sparc/nptl/tcb-offsets.sym b/sysdeps/sparc/nptl/tcb-offsets.sym
> index f75d02065e..e4a7e4720f 100644
> --- a/sysdeps/sparc/nptl/tcb-offsets.sym
> +++ b/sysdeps/sparc/nptl/tcb-offsets.sym
> @@ -1,6 +1,5 @@
>  #include <sysdep.h>
>  #include <tls.h>
>
> -MULTIPLE_THREADS_OFFSET                offsetof (tcbhead_t, multiple_threads)
>  POINTER_GUARD                  offsetof (tcbhead_t, pointer_guard)
>  TID                            offsetof (struct pthread, tid)
> diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
> index d1e2bb4ad1..b78cf0d6b4 100644
> --- a/sysdeps/sparc/nptl/tls.h
> +++ b/sysdeps/sparc/nptl/tls.h
> @@ -35,7 +35,7 @@ typedef struct
>                            thread descriptor used by libpthread.  */
>    dtv_t *dtv;
>    void *self;
> -  int multiple_threads;
> +  int unused_multiple_threads;
>  #if __WORDSIZE == 64
>    int gscope_flag;
>  #endif
> diff --git a/sysdeps/unix/sysv/linux/single-thread.h b/sysdeps/unix/sysv/linux/single-thread.h
> index 208edccce6..dd80e82c82 100644
> --- a/sysdeps/unix/sysv/linux/single-thread.h
> +++ b/sysdeps/unix/sysv/linux/single-thread.h
> @@ -23,20 +23,7 @@
>  # include <sys/single_threaded.h>
>  #endif
>
> -/* The default way to check if the process is single thread is by using the
> -   pthread_t 'multiple_threads' field.  However, for some architectures it is
> -   faster to either use an extra field on TCB or global variables (the TCB
> -   field is also used on x86 for some single-thread atomic optimizations).
> -
> -   The ABI might define SINGLE_THREAD_BY_GLOBAL to enable the single thread
> -   check to use global variables instead of the pthread_t field.  */
> -
> -#if !defined SINGLE_THREAD_BY_GLOBAL || IS_IN (rtld)
> -# define SINGLE_THREAD_P \
> -  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
> -#else
> -# define SINGLE_THREAD_P (__libc_single_threaded != 0)
> -#endif
> +#define SINGLE_THREAD_P (__libc_single_threaded != 0)
>
>  #define RTLD_SINGLE_THREAD_P SINGLE_THREAD_P
>
> diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
> index f24f1c71ed..23e087e7e0 100644
> --- a/sysdeps/x86/atomic-machine.h
> +++ b/sysdeps/x86/atomic-machine.h
> @@ -51,292 +51,145 @@
>  #define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
>    (! __sync_bool_compare_and_swap (mem, oldval, newval))
>
> +#define __cmpxchg_op(lock, mem, newval, oldval)                                      \
> +  ({ __typeof (*mem) __ret;                                                  \
> +     if (sizeof (*mem) == 1)                                                 \
> +       asm volatile (lock "cmpxchgb %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \
> +     else if (sizeof (*mem) == 2)                                            \
> +       asm volatile (lock "cmpxchgw %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \
> +     else if (sizeof (*mem) == 4)                                            \
> +       asm volatile (lock "cmpxchgl %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \
> +     else if (__HAVE_64B_ATOMICS)                                            \
> +       asm volatile (lock "cmpxchgq %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : "q" ((int64_t) cast_to_integer (newval)),                      \
> +                      "0" ((int64_t) cast_to_integer (oldval))               \
> +                    : "memory");                                             \
> +     else                                                                    \
> +       __atomic_link_error ();                                               \
> +     __ret; })
>
> -#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval)         \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgb %b2, %1"                                 \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
> +     else                                                                    \
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
>       ret; })
>
> -#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval)        \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgw %w2, %1"                                 \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
> +     else                                                                    \
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
>       ret; })
>
> -#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval)        \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgl %2, %1"                                  \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
> +     else                                                                    \
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
>       ret; })
>
> -#ifdef __x86_64__
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval)        \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"                               \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgq %q2, %1"                                 \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : "q" ((int64_t) cast_to_integer (newval)),            \
> -                        "m" (*mem),                                          \
> -                        "0" ((int64_t) cast_to_integer (oldval)),            \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> -     ret; })
> -# define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
> -# define do_add_val_64_acq(pfx, mem, value) do { } while (0)
> -#else
> -/* XXX We do not really need 64-bit compare-and-exchange.  At least
> -   not in the moment.  Using it would mean causing portability
> -   problems since not many other 32-bit architectures have support for
> -   such an operation.  So don't define any code for now.  If it is
> -   really going to be used the code below can be used on Intel Pentium
> -   and later, but NOT on i486.  */
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret = *(mem);                                           \
> -     __atomic_link_error ();                                                 \
> -     ret = (newval);                                                         \
> -     ret = (oldval);                                                         \
> -     ret; })
> -
> -# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)         \
> -  ({ __typeof (*mem) ret = *(mem);                                           \
> -     __atomic_link_error ();                                                 \
> -     ret = (newval);                                                         \
> -     ret = (oldval);                                                         \
> -     ret; })
> -
> -# define do_exchange_and_add_val_64_acq(pfx, mem, value) \
> -  ({ __typeof (value) __addval = (value);                                    \
> -     __typeof (*mem) __result;                                               \
> -     __typeof (mem) __memp = (mem);                                          \
> -     __typeof (*mem) __tmpval;                                               \
> -     __result = *__memp;                                                     \
> -     do                                                                              \
> -       __tmpval = __result;                                                  \
> -     while ((__result = pfx##_compare_and_exchange_val_64_acq                \
> -            (__memp, __result + __addval, __result)) == __tmpval);           \
> -     __result; })
> -
> -# define do_add_val_64_acq(pfx, mem, value) \
> -  {                                                                          \
> -    __typeof (value) __addval = (value);                                     \
> -    __typeof (mem) __memp = (mem);                                           \
> -    __typeof (*mem) __oldval = *__memp;                                              \
> -    __typeof (*mem) __tmpval;                                                \
> -    do                                                                       \
> -      __tmpval = __oldval;                                                   \
> -    while ((__oldval = pfx##_compare_and_exchange_val_64_acq                 \
> -           (__memp, __oldval + __addval, __oldval)) == __tmpval);            \
> -  }
> -#endif
> -
> -
> -/* Note that we need no lock prefix.  */
> -#define atomic_exchange_acq(mem, newvalue) \
> -  ({ __typeof (*mem) result;                                                 \
> -     if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile ("xchgb %b0, %1"                                     \
> -                        : "=q" (result), "=m" (*mem)                         \
> -                        : "0" (newvalue), "m" (*mem));                       \
> -     else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile ("xchgw %w0, %1"                                     \
> -                        : "=r" (result), "=m" (*mem)                         \
> -                        : "0" (newvalue), "m" (*mem));                       \
> -     else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile ("xchgl %0, %1"                                              \
> -                        : "=r" (result), "=m" (*mem)                         \
> -                        : "0" (newvalue), "m" (*mem));                       \
> -     else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile ("xchgq %q0, %1"                                     \
> -                        : "=r" (result), "=m" (*mem)                         \
> -                        : "0" ((int64_t) cast_to_integer (newvalue)),        \
> -                          "m" (*mem));                                       \
> -     else                                                                    \
> -       {                                                                     \
> -        result = 0;                                                          \
> -        __atomic_link_error ();                                              \
> -       }                                                                     \
> -     result; })
> -
> -
> -#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
> -  ({ __typeof (*mem) __result;                                               \
> -     __typeof (value) __addval = (value);                                    \
> -     if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (lock "xaddb %b0, %1"                                \
> -                        : "=q" (__result), "=m" (*mem)                       \
> -                        : "0" (__addval), "m" (*mem),                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> -     else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (lock "xaddw %w0, %1"                                \
> -                        : "=r" (__result), "=m" (*mem)                       \
> -                        : "0" (__addval), "m" (*mem),                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> -     else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (lock "xaddl %0, %1"                                 \
> -                        : "=r" (__result), "=m" (*mem)                       \
> -                        : "0" (__addval), "m" (*mem),                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> -     else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (lock "xaddq %q0, %1"                                \
> -                        : "=r" (__result), "=m" (*mem)                       \
> -                        : "0" ((int64_t) cast_to_integer (__addval)),     \
> -                          "m" (*mem),                                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
>       else                                                                    \
> -       __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
> -     __result; })
> -
> -#define atomic_exchange_and_add(mem, value) \
> -  __sync_fetch_and_add (mem, value)
> -
> -#define __arch_exchange_and_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_exchange_and_add(mem, value) \
> -  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
> -                               mem, value)
> -
> -
> -#define __arch_add_body(lock, pfx, apfx, mem, value) \
> -  do {                                                                       \
> -    if (__builtin_constant_p (value) && (value) == 1)                        \
> -      pfx##_increment (mem);                                                 \
> -    else if (__builtin_constant_p (value) && (value) == -1)                  \
> -      pfx##_decrement (mem);                                                 \
> -    else if (sizeof (*mem) == 1)                                             \
> -      __asm __volatile (lock "addb %b1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : IBR_CONSTRAINT (value), "m" (*mem),                 \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "addw %w1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (value), "m" (*mem),                           \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "addl %1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (value), "m" (*mem),                           \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "addq %q1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" ((int64_t) cast_to_integer (value)),           \
> -                         "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      do_add_val_64_acq (apfx, (mem), (value));                                      \
> -  } while (0)
> -
> -# define atomic_add(mem, value) \
> -  __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
> -
> -#define __arch_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_add(mem, value) \
> -  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
> +     ret; })
>
>
> -#define atomic_add_negative(mem, value) \
> -  ({ unsigned char __result;                                                 \
> +#define __xchg_op(lock, mem, arg, op)                                        \
> +  ({ __typeof (*mem) __ret = (arg);                                          \
>       if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : IBR_CONSTRAINT (value), "m" (*mem));               \
> +       __asm __volatile (lock #op "b %b0, %1"                                \
> +                        : "=q" (__ret), "=m" (*mem)                          \
> +                        : "0" (arg), "m" (*mem)                              \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "w %w0, %1"                                \
> +                        : "=r" (__ret), "=m" (*mem)                          \
> +                        : "0" (arg), "m" (*mem)                              \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1"                  \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "l %0, %1"                                 \
> +                        : "=r" (__ret), "=m" (*mem)                          \
> +                        : "0" (arg), "m" (*mem)                              \
> +                        : "memory", "cc");                                   \
>       else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (LOCK_PREFIX "addq %q2, %0; sets %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" ((int64_t) cast_to_integer (value)),          \
> -                          "m" (*mem));                                       \
> +       __asm __volatile (lock #op "q %q0, %1"                                \
> +                        : "=r" (__ret), "=m" (*mem)                          \
> +                        : "0" ((int64_t) cast_to_integer (arg)),             \
> +                          "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else                                                                    \
>         __atomic_link_error ();                                               \
> -     __result; })
> -
> +     __ret; })
>
> -#define atomic_add_zero(mem, value) \
> -  ({ unsigned char __result;                                                 \
> +#define __single_op(lock, mem, op)                                           \
> +  ({                                                                         \
>       if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : IBR_CONSTRAINT (value), "m" (*mem));               \
> +       __asm __volatile (lock #op "b %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "w %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"                  \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "l %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (LOCK_PREFIX "addq %q2, %0; setz %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" ((int64_t) cast_to_integer (value)),          \
> -                          "m" (*mem));                                       \
> +       __asm __volatile (lock #op "q %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else                                                                    \
> -       __atomic_link_error ();                                       \
> -     __result; })
> +       __atomic_link_error ();                                               \
> +  })
>
> +/* Note that we need no lock prefix.  */
> +#define atomic_exchange_acq(mem, newvalue)                                   \
> +  __xchg_op ("", (mem), (newvalue), xchg)
>
> -#define __arch_increment_body(lock, pfx, mem) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "incb %b0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "incw %w0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "incl %0"                                       \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "incq %q0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      do_add_val_64_acq (pfx, mem, 1);                                       \
> -  } while (0)
> +#define atomic_add(mem, value) \
> +  __xchg_op (LOCK_PREFIX, (mem), (value), add);                                      \
>
> -#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
> +#define catomic_add(mem, value)                                                      \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __xchg_op ("", (mem), (value), add);                                   \
> +   else                                                                              \
> +     atomic_add (mem, value);                                                \
> +  })
>
> -#define __arch_increment_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
>
> -#define catomic_increment(mem) \
> -  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
> +#define atomic_increment(mem) \
> +  __single_op (LOCK_PREFIX, (mem), inc)
>
> +#define catomic_increment(mem)                                               \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __single_op ("", (mem), inc);                                          \
> +   else                                                                              \
> +     atomic_increment (mem);                                                 \
> +  })
>
>  #define atomic_increment_and_test(mem) \
>    ({ unsigned char __result;                                                 \
> @@ -357,43 +210,20 @@
>                          : "=m" (*mem), "=qm" (__result)                      \
>                          : "m" (*mem));                                       \
>       else                                                                    \
> -       __atomic_link_error ();                                       \
> +       __atomic_link_error ();                                               \
>       __result; })
>
>
> -#define __arch_decrement_body(lock, pfx, mem) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "decb %b0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "decw %w0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "decl %0"                                       \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "decq %q0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      do_add_val_64_acq (pfx, mem, -1);                                              \
> -  } while (0)
> -
> -#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
> +#define atomic_decrement(mem)                                                \
> +  __single_op (LOCK_PREFIX, (mem), dec)
>
> -#define __arch_decrement_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_decrement(mem) \
> -  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
> +#define catomic_decrement(mem)                                               \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __single_op ("", (mem), dec);                                          \
> +   else                                                                              \
> +     atomic_decrement (mem);                                                 \
> +  })
>
>
>  #define atomic_decrement_and_test(mem) \
> @@ -463,73 +293,31 @@
>                          : "=q" (__result), "=m" (*mem)                       \
>                          : "m" (*mem), "ir" (bit));                           \
>       else                                                                    \
> -       __atomic_link_error ();                                       \
> +       __atomic_link_error ();                                               \
>       __result; })
>
>
> -#define __arch_and_body(lock, mem, mask) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "andb %b1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : IBR_CONSTRAINT (mask), "m" (*mem),                  \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "andw %w1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "andl %1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "andq %q1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      __atomic_link_error ();                                                \
> -  } while (0)
> -
> -#define __arch_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> -#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
> -
> -#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
> +#define atomic_and(mem, mask)                                                \
> +  __xchg_op (LOCK_PREFIX, (mem), (mask), and)
>
> +#define catomic_and(mem, mask) \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __xchg_op ("", (mem), (mask), and);                                    \
> +   else                                                                              \
> +      atomic_and (mem, mask);                                                \
> +  })
>
> -#define __arch_or_body(lock, mem, mask) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "orb %b1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : IBR_CONSTRAINT (mask), "m" (*mem),                  \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "orw %w1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "orl %1, %0"                                    \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "orq %q1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      __atomic_link_error ();                                                \
> -  } while (0)
> -
> -#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
> +#define atomic_or(mem, mask)                                                 \
> +  __xchg_op (LOCK_PREFIX, (mem), (mask), or)
>
> -#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
> +#define catomic_or(mem, mask) \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __xchg_op ("", (mem), (mask), or);                                     \
> +   else                                                                              \
> +      atomic_or (mem, mask);                                                 \
> +  })

I believe the motivation for this approach on x86 was that PIC access
was quite expensive.  Although x86-64 has PC-relative access, do
we access the local copy of SINGLE_THREAD_P directly or the global
one via GOT?
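
One way to answer that empirically (a sketch only; the file name probe.c
and the function name probe_single_threaded are made up for illustration)
is to inspect the code the compiler generates for a read of the flag:

/* Build with "gcc -O2 -fPIC -S probe.c" and look at how the load of
   __libc_single_threaded is emitted: a load through
   __libc_single_threaded@GOTPCREL(%rip) goes via the GOT, while a plain
   RIP-relative access means the symbol was bound locally (for instance
   through a hidden alias inside libc.so).  */
#include <sys/single_threaded.h>

int
probe_single_threaded (void)
{
  return __libc_single_threaded != 0;
}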

>  /* We don't use mfence because it is supposedly slower due to having to
>     provide stronger guarantees (e.g., regarding self-modifying code).  */
> diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
> index 2bbd563a6c..8ec55a7ea8 100644
> --- a/sysdeps/x86_64/nptl/tcb-offsets.sym
> +++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
> @@ -9,7 +9,6 @@ CLEANUP_JMP_BUF         offsetof (struct pthread, cleanup_jmp_buf)
>  CLEANUP                        offsetof (struct pthread, cleanup)
>  CLEANUP_PREV           offsetof (struct _pthread_cleanup_buffer, __prev)
>  MUTEX_FUTEX            offsetof (pthread_mutex_t, __data.__lock)
> -MULTIPLE_THREADS_OFFSET        offsetof (tcbhead_t, multiple_threads)
>  POINTER_GUARD          offsetof (tcbhead_t, pointer_guard)
>  FEATURE_1_OFFSET       offsetof (tcbhead_t, feature_1)
>  SSP_BASE_OFFSET                offsetof (tcbhead_t, ssp_base)
> --
> 2.34.1
>
Noah Goldstein June 10, 2022, 9 p.m. UTC | #2
On Fri, Jun 10, 2022 at 9:39 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Instead use __libc_single_threaded on all architectures.  The TCB
> field is renamed to avoid changing the struct layout.
>
> The x86 atomic need some adjustments since it has single-thread
> optimizationi builtin within the inline assemblye.  It now uses
> SINGLE_THREAD_P and atomic optimizations are removed (since they
> are not used).
>
> Checked on x86_64-linux-gnu and i686-linux-gnu.
> ---
>  misc/tst-atomic.c                       |   1 +
>  nptl/allocatestack.c                    |   6 -
>  nptl/descr.h                            |  17 +-
>  nptl/pthread_cancel.c                   |   7 +-
>  nptl/pthread_create.c                   |   5 -
>  sysdeps/i386/htl/tcb-offsets.sym        |   1 -
>  sysdeps/i386/nptl/tcb-offsets.sym       |   1 -
>  sysdeps/i386/nptl/tls.h                 |   4 +-
>  sysdeps/ia64/nptl/tcb-offsets.sym       |   1 -
>  sysdeps/ia64/nptl/tls.h                 |   2 -
>  sysdeps/mach/hurd/i386/tls.h            |   4 +-
>  sysdeps/nios2/nptl/tcb-offsets.sym      |   1 -
>  sysdeps/or1k/nptl/tls.h                 |   2 -
>  sysdeps/powerpc/nptl/tcb-offsets.sym    |   3 -
>  sysdeps/powerpc/nptl/tls.h              |   3 -
>  sysdeps/s390/nptl/tcb-offsets.sym       |   1 -
>  sysdeps/s390/nptl/tls.h                 |   6 +-
>  sysdeps/sh/nptl/tcb-offsets.sym         |   1 -
>  sysdeps/sh/nptl/tls.h                   |   2 -
>  sysdeps/sparc/nptl/tcb-offsets.sym      |   1 -
>  sysdeps/sparc/nptl/tls.h                |   2 +-
>  sysdeps/unix/sysv/linux/single-thread.h |  15 +-
>  sysdeps/x86/atomic-machine.h            | 484 +++++++-----------------
>  sysdeps/x86_64/nptl/tcb-offsets.sym     |   1 -
>  24 files changed, 145 insertions(+), 426 deletions(-)
>
> diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
> index 6d681a7bfd..ddbc618e25 100644
> --- a/misc/tst-atomic.c
> +++ b/misc/tst-atomic.c
> @@ -18,6 +18,7 @@
>
>  #include <stdio.h>
>  #include <atomic.h>
> +#include <support/xthread.h>
>
>  #ifndef atomic_t
>  # define atomic_t int
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 98f5f6dd85..3e0d01cb52 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -290,9 +290,6 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>          stack cache nor will the memory (except the TLS memory) be freed.  */
>        pd->user_stack = true;
>
> -      /* This is at least the second thread.  */
> -      pd->header.multiple_threads = 1;
> -
>  #ifdef NEED_DL_SYSINFO
>        SETUP_THREAD_SYSINFO (pd);
>  #endif
> @@ -408,9 +405,6 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>              descriptor.  */
>           pd->specific[0] = pd->specific_1stblock;
>
> -         /* This is at least the second thread.  */
> -         pd->header.multiple_threads = 1;
> -
>  #ifdef NEED_DL_SYSINFO
>           SETUP_THREAD_SYSINFO (pd);
>  #endif
> diff --git a/nptl/descr.h b/nptl/descr.h
> index bb46b5958e..77b25d8267 100644
> --- a/nptl/descr.h
> +++ b/nptl/descr.h
> @@ -137,22 +137,7 @@ struct pthread
>  #else
>      struct
>      {
> -      /* multiple_threads is enabled either when the process has spawned at
> -        least one thread or when a single-threaded process cancels itself.
> -        This enables additional code to introduce locking before doing some
> -        compare_and_exchange operations and also enable cancellation points.
> -        The concepts of multiple threads and cancellation points ideally
> -        should be separate, since it is not necessary for multiple threads to
> -        have been created for cancellation points to be enabled, as is the
> -        case is when single-threaded process cancels itself.
> -
> -        Since enabling multiple_threads enables additional code in
> -        cancellation points and compare_and_exchange operations, there is a
> -        potential for an unneeded performance hit when it is enabled in a
> -        single-threaded, self-canceling process.  This is OK though, since a
> -        single-threaded process will enable async cancellation only when it
> -        looks to cancel itself and is hence going to end anyway.  */
> -      int multiple_threads;
> +      int unused_multiple_threads;
>        int gscope_flag;
>      } header;
>  #endif
> diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
> index e1735279f2..6d26a15d0e 100644
> --- a/nptl/pthread_cancel.c
> +++ b/nptl/pthread_cancel.c
> @@ -157,12 +157,9 @@ __pthread_cancel (pthread_t th)
>
>         /* A single-threaded process should be able to kill itself, since
>            there is nothing in the POSIX specification that says that it
> -          cannot.  So we set multiple_threads to true so that cancellation
> -          points get executed.  */
> -       THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
> -#ifndef TLS_MULTIPLE_THREADS_IN_TCB
> +          cannot.  So we set __libc_single_threaded to true so that
> +          cancellation points get executed.  */
>         __libc_single_threaded = 0;
> -#endif
>      }
>    while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval,
>                                                 newval));
> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 5633d01c62..d43865352f 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -882,11 +882,6 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
>            other reason that create_thread chose.  Now let it run
>            free.  */
>         lll_unlock (pd->lock, LLL_PRIVATE);
> -
> -      /* We now have for sure more than one thread.  The main thread might
> -        not yet have the flag set.  No need to set the global variable
> -        again if this is what we use.  */
> -      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
>      }
>
>   out:
> diff --git a/sysdeps/i386/htl/tcb-offsets.sym b/sysdeps/i386/htl/tcb-offsets.sym
> index 7b7c719369..f3f7df6c06 100644
> --- a/sysdeps/i386/htl/tcb-offsets.sym
> +++ b/sysdeps/i386/htl/tcb-offsets.sym
> @@ -2,7 +2,6 @@
>  #include <tls.h>
>  #include <kernel-features.h>
>
> -MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads)
>  SYSINFO_OFFSET          offsetof (tcbhead_t, sysinfo)
>  POINTER_GUARD           offsetof (tcbhead_t, pointer_guard)
>  SIGSTATE_OFFSET         offsetof (tcbhead_t, _hurd_sigstate)
> diff --git a/sysdeps/i386/nptl/tcb-offsets.sym b/sysdeps/i386/nptl/tcb-offsets.sym
> index 2ec9e787c1..1efd1469d8 100644
> --- a/sysdeps/i386/nptl/tcb-offsets.sym
> +++ b/sysdeps/i386/nptl/tcb-offsets.sym
> @@ -6,7 +6,6 @@ RESULT                  offsetof (struct pthread, result)
>  TID                    offsetof (struct pthread, tid)
>  CANCELHANDLING         offsetof (struct pthread, cancelhandling)
>  CLEANUP_JMP_BUF                offsetof (struct pthread, cleanup_jmp_buf)
> -MULTIPLE_THREADS_OFFSET        offsetof (tcbhead_t, multiple_threads)
>  SYSINFO_OFFSET         offsetof (tcbhead_t, sysinfo)
>  CLEANUP                        offsetof (struct pthread, cleanup)
>  CLEANUP_PREV           offsetof (struct _pthread_cleanup_buffer, __prev)
> diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
> index 91090bf287..48940a9f44 100644
> --- a/sysdeps/i386/nptl/tls.h
> +++ b/sysdeps/i386/nptl/tls.h
> @@ -36,7 +36,7 @@ typedef struct
>                            thread descriptor used by libpthread.  */
>    dtv_t *dtv;
>    void *self;          /* Pointer to the thread descriptor.  */
> -  int multiple_threads;
> +  int unused_multiple_threads;
>    uintptr_t sysinfo;
>    uintptr_t stack_guard;
>    uintptr_t pointer_guard;
> @@ -57,8 +57,6 @@ typedef struct
>  _Static_assert (offsetof (tcbhead_t, __private_ss) == 0x30,
>                 "offset of __private_ss != 0x30");
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif
> diff --git a/sysdeps/ia64/nptl/tcb-offsets.sym b/sysdeps/ia64/nptl/tcb-offsets.sym
> index b01f712be2..ab2cb180f9 100644
> --- a/sysdeps/ia64/nptl/tcb-offsets.sym
> +++ b/sysdeps/ia64/nptl/tcb-offsets.sym
> @@ -2,5 +2,4 @@
>  #include <tls.h>
>
>  TID                    offsetof (struct pthread, tid) - TLS_PRE_TCB_SIZE
> -MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads) - TLS_PRE_TCB_SIZE
>  SYSINFO_OFFSET         offsetof (tcbhead_t, __private)
> diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
> index 8ccedb73e6..008e080fc4 100644
> --- a/sysdeps/ia64/nptl/tls.h
> +++ b/sysdeps/ia64/nptl/tls.h
> @@ -36,8 +36,6 @@ typedef struct
>
>  register struct pthread *__thread_self __asm__("r13");
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif
> diff --git a/sysdeps/mach/hurd/i386/tls.h b/sysdeps/mach/hurd/i386/tls.h
> index 264ed9a9c5..d33e91c922 100644
> --- a/sysdeps/mach/hurd/i386/tls.h
> +++ b/sysdeps/mach/hurd/i386/tls.h
> @@ -33,7 +33,7 @@ typedef struct
>    void *tcb;                   /* Points to this structure.  */
>    dtv_t *dtv;                  /* Vector of pointers to TLS data.  */
>    thread_t self;               /* This thread's control port.  */
> -  int multiple_threads;
> +  int unused_multiple_threads;
>    uintptr_t sysinfo;
>    uintptr_t stack_guard;
>    uintptr_t pointer_guard;
> @@ -117,8 +117,6 @@ _hurd_tls_init (tcbhead_t *tcb)
>    /* This field is used by TLS accesses to get our "thread pointer"
>       from the TLS point of view.  */
>    tcb->tcb = tcb;
> -  /* We always at least start the sigthread anyway.  */
> -  tcb->multiple_threads = 1;
>
>    /* Get the first available selector.  */
>    int sel = -1;
> diff --git a/sysdeps/nios2/nptl/tcb-offsets.sym b/sysdeps/nios2/nptl/tcb-offsets.sym
> index 3cd8d984ac..93a695ac7f 100644
> --- a/sysdeps/nios2/nptl/tcb-offsets.sym
> +++ b/sysdeps/nios2/nptl/tcb-offsets.sym
> @@ -8,6 +8,5 @@
>  # define __thread_self          ((void *) 0)
>  # define thread_offsetof(mem)   ((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
>
> -MULTIPLE_THREADS_OFFSET                thread_offsetof (header.multiple_threads)
>  TID_OFFSET                     thread_offsetof (tid)
>  POINTER_GUARD                  (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
> diff --git a/sysdeps/or1k/nptl/tls.h b/sysdeps/or1k/nptl/tls.h
> index c6ffe62c3f..3bb07beef8 100644
> --- a/sysdeps/or1k/nptl/tls.h
> +++ b/sysdeps/or1k/nptl/tls.h
> @@ -35,8 +35,6 @@ typedef struct
>
>  register tcbhead_t *__thread_self __asm__("r10");
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  /* Get system call information.  */
>  # include <sysdep.h>
>
> diff --git a/sysdeps/powerpc/nptl/tcb-offsets.sym b/sysdeps/powerpc/nptl/tcb-offsets.sym
> index 4c01615ad0..a0ee95f94d 100644
> --- a/sysdeps/powerpc/nptl/tcb-offsets.sym
> +++ b/sysdeps/powerpc/nptl/tcb-offsets.sym
> @@ -10,9 +10,6 @@
>  # define thread_offsetof(mem)  ((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
>
>
> -#if TLS_MULTIPLE_THREADS_IN_TCB
> -MULTIPLE_THREADS_OFFSET                thread_offsetof (header.multiple_threads)
> -#endif
>  TID                            thread_offsetof (tid)
>  POINTER_GUARD                  (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
>  TAR_SAVE                       (offsetof (tcbhead_t, tar_save) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
> diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
> index 22b0075235..fd5ee51981 100644
> --- a/sysdeps/powerpc/nptl/tls.h
> +++ b/sysdeps/powerpc/nptl/tls.h
> @@ -52,9 +52,6 @@
>  # define TLS_DTV_AT_TP 1
>  # define TLS_TCB_AT_TP 0
>
> -/* We use the multiple_threads field in the pthread struct */
> -#define TLS_MULTIPLE_THREADS_IN_TCB    1
> -
>  /* Get the thread descriptor definition.  */
>  # include <nptl/descr.h>
>
> diff --git a/sysdeps/s390/nptl/tcb-offsets.sym b/sysdeps/s390/nptl/tcb-offsets.sym
> index 9c1c01f353..bc7b267463 100644
> --- a/sysdeps/s390/nptl/tcb-offsets.sym
> +++ b/sysdeps/s390/nptl/tcb-offsets.sym
> @@ -1,6 +1,5 @@
>  #include <sysdep.h>
>  #include <tls.h>
>
> -MULTIPLE_THREADS_OFFSET                offsetof (tcbhead_t, multiple_threads)
>  STACK_GUARD                    offsetof (tcbhead_t, stack_guard)
>  TID                            offsetof (struct pthread, tid)
> diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
> index ff210ffeb2..d69ed539f7 100644
> --- a/sysdeps/s390/nptl/tls.h
> +++ b/sysdeps/s390/nptl/tls.h
> @@ -35,7 +35,7 @@ typedef struct
>                            thread descriptor used by libpthread.  */
>    dtv_t *dtv;
>    void *self;          /* Pointer to the thread descriptor.  */
> -  int multiple_threads;
> +  int unused_multiple_threads;
>    uintptr_t sysinfo;
>    uintptr_t stack_guard;
>    int gscope_flag;
> @@ -44,10 +44,6 @@ typedef struct
>    void *__private_ss;
>  } tcbhead_t;
>
> -# ifndef __s390x__
> -#  define TLS_MULTIPLE_THREADS_IN_TCB 1
> -# endif
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif
> diff --git a/sysdeps/sh/nptl/tcb-offsets.sym b/sysdeps/sh/nptl/tcb-offsets.sym
> index 234207779d..4e452d9c6c 100644
> --- a/sysdeps/sh/nptl/tcb-offsets.sym
> +++ b/sysdeps/sh/nptl/tcb-offsets.sym
> @@ -6,7 +6,6 @@ RESULT                  offsetof (struct pthread, result)
>  TID                    offsetof (struct pthread, tid)
>  CANCELHANDLING         offsetof (struct pthread, cancelhandling)
>  CLEANUP_JMP_BUF                offsetof (struct pthread, cleanup_jmp_buf)
> -MULTIPLE_THREADS_OFFSET        offsetof (struct pthread, header.multiple_threads)
>  TLS_PRE_TCB_SIZE       sizeof (struct pthread)
>  MUTEX_FUTEX            offsetof (pthread_mutex_t, __data.__lock)
>  POINTER_GUARD          offsetof (tcbhead_t, pointer_guard)
> diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
> index 76591ab6ef..8778cb4ac0 100644
> --- a/sysdeps/sh/nptl/tls.h
> +++ b/sysdeps/sh/nptl/tls.h
> @@ -36,8 +36,6 @@ typedef struct
>    uintptr_t pointer_guard;
>  } tcbhead_t;
>
> -# define TLS_MULTIPLE_THREADS_IN_TCB 1
> -
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>
>  #endif /* __ASSEMBLER__ */
> diff --git a/sysdeps/sparc/nptl/tcb-offsets.sym b/sysdeps/sparc/nptl/tcb-offsets.sym
> index f75d02065e..e4a7e4720f 100644
> --- a/sysdeps/sparc/nptl/tcb-offsets.sym
> +++ b/sysdeps/sparc/nptl/tcb-offsets.sym
> @@ -1,6 +1,5 @@
>  #include <sysdep.h>
>  #include <tls.h>
>
> -MULTIPLE_THREADS_OFFSET                offsetof (tcbhead_t, multiple_threads)
>  POINTER_GUARD                  offsetof (tcbhead_t, pointer_guard)
>  TID                            offsetof (struct pthread, tid)
> diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
> index d1e2bb4ad1..b78cf0d6b4 100644
> --- a/sysdeps/sparc/nptl/tls.h
> +++ b/sysdeps/sparc/nptl/tls.h
> @@ -35,7 +35,7 @@ typedef struct
>                            thread descriptor used by libpthread.  */
>    dtv_t *dtv;
>    void *self;
> -  int multiple_threads;
> +  int unused_multiple_threads;
>  #if __WORDSIZE == 64
>    int gscope_flag;
>  #endif
> diff --git a/sysdeps/unix/sysv/linux/single-thread.h b/sysdeps/unix/sysv/linux/single-thread.h
> index 208edccce6..dd80e82c82 100644
> --- a/sysdeps/unix/sysv/linux/single-thread.h
> +++ b/sysdeps/unix/sysv/linux/single-thread.h
> @@ -23,20 +23,7 @@
>  # include <sys/single_threaded.h>
>  #endif
>
> -/* The default way to check if the process is single thread is by using the
> -   pthread_t 'multiple_threads' field.  However, for some architectures it is
> -   faster to either use an extra field on TCB or global variables (the TCB
> -   field is also used on x86 for some single-thread atomic optimizations).
> -
> -   The ABI might define SINGLE_THREAD_BY_GLOBAL to enable the single thread
> -   check to use global variables instead of the pthread_t field.  */
> -
> -#if !defined SINGLE_THREAD_BY_GLOBAL || IS_IN (rtld)
> -# define SINGLE_THREAD_P \
> -  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
> -#else
> -# define SINGLE_THREAD_P (__libc_single_threaded != 0)
> -#endif
> +#define SINGLE_THREAD_P (__libc_single_threaded != 0)
>
>  #define RTLD_SINGLE_THREAD_P SINGLE_THREAD_P
>
> diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
> index f24f1c71ed..23e087e7e0 100644
> --- a/sysdeps/x86/atomic-machine.h
> +++ b/sysdeps/x86/atomic-machine.h
> @@ -51,292 +51,145 @@
>  #define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
>    (! __sync_bool_compare_and_swap (mem, oldval, newval))
>
> +#define __cmpxchg_op(lock, mem, newval, oldval)                                      \
> +  ({ __typeof (*mem) __ret;                                                  \
> +     if (sizeof (*mem) == 1)                                                 \
> +       asm volatile (lock "cmpxchgb %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \

Is the full "memory" clobber needed? Shouldn't the "+m"(*mem) be enough?
> +     else if (sizeof (*mem) == 2)                                            \
> +       asm volatile (lock "cmpxchgw %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \
> +     else if (sizeof (*mem) == 4)                                            \
> +       asm volatile (lock "cmpxchgl %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \
> +     else if (__HAVE_64B_ATOMICS)                                            \
> +       asm volatile (lock "cmpxchgq %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : "q" ((int64_t) cast_to_integer (newval)),                      \
> +                      "0" ((int64_t) cast_to_integer (oldval))               \
> +                    : "memory");                                             \
> +     else                                                                    \
> +       __atomic_link_error ();                                               \
> +     __ret; })
>
> -#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval)         \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgb %b2, %1"                                 \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
> +     else                                                                    \
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
>       ret; })
>
> -#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval)        \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgw %w2, %1"                                 \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
> +     else                                                                    \
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
>       ret; })
>
> -#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval)        \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"                              \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgl %2, %1"                                  \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
> +     else                                                                    \
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
>       ret; })
>
> -#ifdef __x86_64__
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> +#define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval)        \
>    ({ __typeof (*mem) ret;                                                    \
> -     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"                               \
> -                      "je 0f\n\t"                                            \
> -                      "lock\n"                                               \
> -                      "0:\tcmpxchgq %q2, %1"                                 \
> -                      : "=a" (ret), "=m" (*mem)                              \
> -                      : "q" ((int64_t) cast_to_integer (newval)),            \
> -                        "m" (*mem),                                          \
> -                        "0" ((int64_t) cast_to_integer (oldval)),            \
> -                        "i" (offsetof (tcbhead_t, multiple_threads)));       \
> -     ret; })
> -# define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
> -# define do_add_val_64_acq(pfx, mem, value) do { } while (0)
> -#else
> -/* XXX We do not really need 64-bit compare-and-exchange.  At least
> -   not in the moment.  Using it would mean causing portability
> -   problems since not many other 32-bit architectures have support for
> -   such an operation.  So don't define any code for now.  If it is
> -   really going to be used the code below can be used on Intel Pentium
> -   and later, but NOT on i486.  */
> -# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
> -  ({ __typeof (*mem) ret = *(mem);                                           \
> -     __atomic_link_error ();                                                 \
> -     ret = (newval);                                                         \
> -     ret = (oldval);                                                         \
> -     ret; })
> -
> -# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)         \
> -  ({ __typeof (*mem) ret = *(mem);                                           \
> -     __atomic_link_error ();                                                 \
> -     ret = (newval);                                                         \
> -     ret = (oldval);                                                         \
> -     ret; })
> -
> -# define do_exchange_and_add_val_64_acq(pfx, mem, value) \
> -  ({ __typeof (value) __addval = (value);                                    \
> -     __typeof (*mem) __result;                                               \
> -     __typeof (mem) __memp = (mem);                                          \
> -     __typeof (*mem) __tmpval;                                               \
> -     __result = *__memp;                                                     \
> -     do                                                                              \
> -       __tmpval = __result;                                                  \
> -     while ((__result = pfx##_compare_and_exchange_val_64_acq                \
> -            (__memp, __result + __addval, __result)) == __tmpval);           \
> -     __result; })
> -
> -# define do_add_val_64_acq(pfx, mem, value) \
> -  {                                                                          \
> -    __typeof (value) __addval = (value);                                     \
> -    __typeof (mem) __memp = (mem);                                           \
> -    __typeof (*mem) __oldval = *__memp;                                              \
> -    __typeof (*mem) __tmpval;                                                \
> -    do                                                                       \
> -      __tmpval = __oldval;                                                   \
> -    while ((__oldval = pfx##_compare_and_exchange_val_64_acq                 \
> -           (__memp, __oldval + __addval, __oldval)) == __tmpval);            \
> -  }
> -#endif
> -
> -
> -/* Note that we need no lock prefix.  */
> -#define atomic_exchange_acq(mem, newvalue) \
> -  ({ __typeof (*mem) result;                                                 \
> -     if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile ("xchgb %b0, %1"                                     \
> -                        : "=q" (result), "=m" (*mem)                         \
> -                        : "0" (newvalue), "m" (*mem));                       \
> -     else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile ("xchgw %w0, %1"                                     \
> -                        : "=r" (result), "=m" (*mem)                         \
> -                        : "0" (newvalue), "m" (*mem));                       \
> -     else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile ("xchgl %0, %1"                                              \
> -                        : "=r" (result), "=m" (*mem)                         \
> -                        : "0" (newvalue), "m" (*mem));                       \
> -     else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile ("xchgq %q0, %1"                                     \
> -                        : "=r" (result), "=m" (*mem)                         \
> -                        : "0" ((int64_t) cast_to_integer (newvalue)),        \
> -                          "m" (*mem));                                       \
> -     else                                                                    \
> -       {                                                                     \
> -        result = 0;                                                          \
> -        __atomic_link_error ();                                              \
> -       }                                                                     \
> -     result; })
> -
> -
> -#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
> -  ({ __typeof (*mem) __result;                                               \
> -     __typeof (value) __addval = (value);                                    \
> -     if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (lock "xaddb %b0, %1"                                \
> -                        : "=q" (__result), "=m" (*mem)                       \
> -                        : "0" (__addval), "m" (*mem),                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> -     else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (lock "xaddw %w0, %1"                                \
> -                        : "=r" (__result), "=m" (*mem)                       \
> -                        : "0" (__addval), "m" (*mem),                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> -     else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (lock "xaddl %0, %1"                                 \
> -                        : "=r" (__result), "=m" (*mem)                       \
> -                        : "0" (__addval), "m" (*mem),                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> -     else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (lock "xaddq %q0, %1"                                \
> -                        : "=r" (__result), "=m" (*mem)                       \
> -                        : "0" ((int64_t) cast_to_integer (__addval)),     \
> -                          "m" (*mem),                                        \
> -                          "i" (offsetof (tcbhead_t, multiple_threads)));     \
> +     if (SINGLE_THREAD_P)                                                    \
> +       __cmpxchg_op ("", (mem), (newval), (oldval));                         \
>       else                                                                    \
> -       __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
> -     __result; })
> -
> -#define atomic_exchange_and_add(mem, value) \
> -  __sync_fetch_and_add (mem, value)
> -
> -#define __arch_exchange_and_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_exchange_and_add(mem, value) \
> -  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
> -                               mem, value)
> -
> -
> -#define __arch_add_body(lock, pfx, apfx, mem, value) \
> -  do {                                                                       \
> -    if (__builtin_constant_p (value) && (value) == 1)                        \
> -      pfx##_increment (mem);                                                 \
> -    else if (__builtin_constant_p (value) && (value) == -1)                  \
> -      pfx##_decrement (mem);                                                 \
> -    else if (sizeof (*mem) == 1)                                             \
> -      __asm __volatile (lock "addb %b1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : IBR_CONSTRAINT (value), "m" (*mem),                 \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "addw %w1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (value), "m" (*mem),                           \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "addl %1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (value), "m" (*mem),                           \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "addq %q1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" ((int64_t) cast_to_integer (value)),           \
> -                         "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      do_add_val_64_acq (apfx, (mem), (value));                                      \
> -  } while (0)
> -
> -# define atomic_add(mem, value) \
> -  __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
> -
> -#define __arch_add_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_add(mem, value) \
> -  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
> +       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));                \
> +     ret; })
>
>
> -#define atomic_add_negative(mem, value) \
> -  ({ unsigned char __result;                                                 \
> +#define __xchg_op(lock, mem, arg, op)                                        \
> +  ({ __typeof (*mem) __ret = (arg);                                          \
>       if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : IBR_CONSTRAINT (value), "m" (*mem));               \
> +       __asm __volatile (lock #op "b %b0, %1"                                \
> +                        : "=q" (__ret), "=m" (*mem)                          \
> +                        : "0" (arg), "m" (*mem)                              \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "w %w0, %1"                                \
> +                        : "=r" (__ret), "=m" (*mem)                          \
> +                        : "0" (arg), "m" (*mem)                              \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1"                  \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "l %0, %1"                                 \
> +                        : "=r" (__ret), "=m" (*mem)                          \
> +                        : "0" (arg), "m" (*mem)                              \
> +                        : "memory", "cc");                                   \
>       else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (LOCK_PREFIX "addq %q2, %0; sets %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" ((int64_t) cast_to_integer (value)),          \
> -                          "m" (*mem));                                       \
> +       __asm __volatile (lock #op "q %q0, %1"                                \
> +                        : "=r" (__ret), "=m" (*mem)                          \
> +                        : "0" ((int64_t) cast_to_integer (arg)),             \
> +                          "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else                                                                    \
>         __atomic_link_error ();                                               \
> -     __result; })
> -
> +     __ret; })
>
> -#define atomic_add_zero(mem, value) \
> -  ({ unsigned char __result;                                                 \
> +#define __single_op(lock, mem, op)                                           \
> +  ({                                                                         \
>       if (sizeof (*mem) == 1)                                                 \
> -       __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : IBR_CONSTRAINT (value), "m" (*mem));               \
> +       __asm __volatile (lock #op "b %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 2)                                            \
> -       __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "w %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else if (sizeof (*mem) == 4)                                            \
> -       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"                  \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" (value), "m" (*mem));                         \
> +       __asm __volatile (lock #op "l %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else if (__HAVE_64B_ATOMICS)                                            \
> -       __asm __volatile (LOCK_PREFIX "addq %q2, %0; setz %1"                 \
> -                        : "=m" (*mem), "=qm" (__result)                      \
> -                        : "ir" ((int64_t) cast_to_integer (value)),          \
> -                          "m" (*mem));                                       \
> +       __asm __volatile (lock #op "q %b0"                                    \
> +                        : "=m" (*mem)                                        \
> +                        : "m" (*mem)                                         \
> +                        : "memory", "cc");                                   \
>       else                                                                    \
> -       __atomic_link_error ();                                       \
> -     __result; })
> +       __atomic_link_error ();                                               \
> +  })
>
> +/* Note that we need no lock prefix.  */
> +#define atomic_exchange_acq(mem, newvalue)                                   \
> +  __xchg_op ("", (mem), (newvalue), xchg)
>
> -#define __arch_increment_body(lock, pfx, mem) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "incb %b0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "incw %w0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "incl %0"                                       \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "incq %q0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      do_add_val_64_acq (pfx, mem, 1);                                       \
> -  } while (0)
> +#define atomic_add(mem, value) \
> +  __xchg_op (LOCK_PREFIX, (mem), (value), add);                                      \
>
> -#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
> +#define catomic_add(mem, value)                                                      \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __xchg_op ("", (mem), (value), add);                                   \
> +   else                                                                              \
> +     atomic_add (mem, value);                                                \
> +  })
>
> -#define __arch_increment_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
>
> -#define catomic_increment(mem) \
> -  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
> +#define atomic_increment(mem) \
> +  __single_op (LOCK_PREFIX, (mem), inc)
>
> +#define catomic_increment(mem)                                               \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __single_op ("", (mem), inc);                                          \
> +   else                                                                              \
> +     atomic_increment (mem);                                                 \
> +  })
>
>  #define atomic_increment_and_test(mem) \
>    ({ unsigned char __result;                                                 \
> @@ -357,43 +210,20 @@
>                          : "=m" (*mem), "=qm" (__result)                      \
>                          : "m" (*mem));                                       \
>       else                                                                    \
> -       __atomic_link_error ();                                       \
> +       __atomic_link_error ();                                               \
>       __result; })
>
>
> -#define __arch_decrement_body(lock, pfx, mem) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "decb %b0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "decw %w0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "decl %0"                                       \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "decq %q0"                                              \
> -                       : "=m" (*mem)                                         \
> -                       : "m" (*mem),                                         \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      do_add_val_64_acq (pfx, mem, -1);                                              \
> -  } while (0)
> -
> -#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
> +#define atomic_decrement(mem)                                                \
> +  __single_op (LOCK_PREFIX, (mem), dec)
>
> -#define __arch_decrement_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
> -
> -#define catomic_decrement(mem) \
> -  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
> +#define catomic_decrement(mem)                                               \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __single_op ("", (mem), dec);                                          \
> +   else                                                                              \
> +     atomic_decrement (mem);                                                 \
> +  })
>
>
>  #define atomic_decrement_and_test(mem) \
> @@ -463,73 +293,31 @@
>                          : "=q" (__result), "=m" (*mem)                       \
>                          : "m" (*mem), "ir" (bit));                           \
>       else                                                                    \
> -       __atomic_link_error ();                                       \
> +       __atomic_link_error ();                                               \
>       __result; })
>
>
> -#define __arch_and_body(lock, mem, mask) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "andb %b1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : IBR_CONSTRAINT (mask), "m" (*mem),                  \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "andw %w1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "andl %1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "andq %q1, %0"                                  \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      __atomic_link_error ();                                                \
> -  } while (0)
> -
> -#define __arch_cprefix \
> -  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
> -
> -#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
> -
> -#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
> +#define atomic_and(mem, mask)                                                \
> +  __xchg_op (LOCK_PREFIX, (mem), (mask), and)
>
> +#define catomic_and(mem, mask) \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __xchg_op ("", (mem), (mask), and);                                    \
> +   else                                                                              \
> +      atomic_and (mem, mask);                                                \
> +  })
>
> -#define __arch_or_body(lock, mem, mask) \
> -  do {                                                                       \
> -    if (sizeof (*mem) == 1)                                                  \
> -      __asm __volatile (lock "orb %b1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : IBR_CONSTRAINT (mask), "m" (*mem),                  \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 2)                                             \
> -      __asm __volatile (lock "orw %w1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (sizeof (*mem) == 4)                                             \
> -      __asm __volatile (lock "orl %1, %0"                                    \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else if (__HAVE_64B_ATOMICS)                                             \
> -      __asm __volatile (lock "orq %q1, %0"                                   \
> -                       : "=m" (*mem)                                         \
> -                       : "ir" (mask), "m" (*mem),                            \
> -                         "i" (offsetof (tcbhead_t, multiple_threads)));      \
> -    else                                                                     \
> -      __atomic_link_error ();                                                \
> -  } while (0)
> -
> -#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
> +#define atomic_or(mem, mask)                                                 \
> +  __xchg_op (LOCK_PREFIX, (mem), (mask), or)
>
> -#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
> +#define catomic_or(mem, mask) \
> +  ({                                                                         \
> +    if (SINGLE_THREAD_P)                                                     \
> +      __xchg_op ("", (mem), (mask), or);                                     \
> +   else                                                                              \
> +      atomic_or (mem, mask);                                                 \
> +  })
>
>  /* We don't use mfence because it is supposedly slower due to having to
>     provide stronger guarantees (e.g., regarding self-modifying code).  */
> diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
> index 2bbd563a6c..8ec55a7ea8 100644
> --- a/sysdeps/x86_64/nptl/tcb-offsets.sym
> +++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
> @@ -9,7 +9,6 @@ CLEANUP_JMP_BUF         offsetof (struct pthread, cleanup_jmp_buf)
>  CLEANUP                        offsetof (struct pthread, cleanup)
>  CLEANUP_PREV           offsetof (struct _pthread_cleanup_buffer, __prev)
>  MUTEX_FUTEX            offsetof (pthread_mutex_t, __data.__lock)
> -MULTIPLE_THREADS_OFFSET        offsetof (tcbhead_t, multiple_threads)
>  POINTER_GUARD          offsetof (tcbhead_t, pointer_guard)
>  FEATURE_1_OFFSET       offsetof (tcbhead_t, feature_1)
>  SSP_BASE_OFFSET                offsetof (tcbhead_t, ssp_base)
> --
> 2.34.1
>
Wilco Dijkstra June 11, 2022, 1:59 p.m. UTC | #3
Hi Noah,

> +#define __cmpxchg_op(lock, mem, newval, oldval)                                      \
> +  ({ __typeof (*mem) __ret;                                                  \
> +     if (sizeof (*mem) == 1)                                                 \
> +       asm volatile (lock "cmpxchgb %2, %1"                                  \
> +                    : "=a" (ret), "+m" (*mem)                                \
> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
> +                    : "memory");                                             \

> Is the full "memory" clobber needed? Shouldn't the "+m"(*mem) be enough?

For use in acquire/release atomics it is required, since code hoisting and other
optimizations must be prevented.  So the old implementation was buggy, and this
is why we need to remove these target-specific hacks.
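
As a toy example (a sketch, not glibc code): with only "+m" (*mem) the compiler
knows that *mem was modified, but it can still cache or hoist accesses to other
memory across the asm, which is what breaks acquire/release semantics:

  #include <stdint.h>

  int data;
  uint32_t lock_word;

  static inline uint32_t
  xchg_acquire (uint32_t *mem, uint32_t val)
  {
    /* xchg with a memory operand is implicitly locked; the "memory"
       clobber is what acts as the compiler-level barrier.  */
    asm volatile ("xchgl %0, %1"
                  : "+r" (val), "+m" (*mem)
                  :
                  : "memory");
    return val;
  }

  int
  consume (void)
  {
    while (xchg_acquire (&lock_word, 1) != 0)
      ;                     /* spin until the lock is acquired */
    return data;            /* without the "memory" clobber this load could
                               be hoisted above or cached across the xchg */
  }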

Also, it is ridiculous to write hundreds of lines of hacky inline assembler for basic
macros like atomic_bit_test_set when there is only a single use in all of GLIBC, which
can trivially be replaced with the compiler builtin __atomic_fetch_or.
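
For example, such a bit-test-and-set can be written with the builtin along these
lines (an illustrative sketch; the name and the memory order chosen here are not
the actual glibc macro):

  #include <stdbool.h>

  static inline bool
  bit_test_set_acquire (unsigned int *mem, unsigned int bit)
  {
    unsigned int mask = 1U << bit;
    /* __atomic_fetch_or returns the previous value, so testing the bit in
       the return value tells us whether it was already set.  */
    return (__atomic_fetch_or (mem, mask, __ATOMIC_ACQUIRE) & mask) != 0;
  }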

Any single-threaded optimizations should be done on a much higher level and only where
there is a clear performance gain. So we should get rid of all the atomic-machine headers.

Cheers,
Wilco
Wilco Dijkstra June 13, 2022, 9:31 p.m. UTC | #4
Hi Adhemerval,

> The x86 atomic need some adjustments since it has single-thread
> optimizationi builtin within the inline assemblye.  It now uses
> SINGLE_THREAD_P and atomic optimizations are removed (since they
> are not used).

I'd suggest removing all single-thread optimizations from the target atomics.
Many aren't used at all (e.g. catomic_or/catomic_and), and the rest have few
uses, none of which are performance critical.  The uses in malloc.c are in fact
counterproductive, since they are only used when there are multiple threads!
As a result I think you might not need any inline assembler at all.

I have a patch that removes all of the catomic definitions across GLIBC.
If we find code that could benefit, then we can optimize it at a higher level
and in a generic way, similar to how we optimized malloc or 
https://sourceware.org/pipermail/libc-alpha/2022-June/139566.html.

Cheers,
Wilco
Adhemerval Zanella Netto June 15, 2022, 9:07 p.m. UTC | #5
> On 11 Jun 2022, at 06:59, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> Hi Noah,
> 
>> +#define __cmpxchg_op(lock, mem, newval, oldval)                                      \
>> +  ({ __typeof (*mem) __ret;                                                  \
>> +     if (sizeof (*mem) == 1)                                                 \
>> +       asm volatile (lock "cmpxchgb %2, %1"                                  \
>> +                    : "=a" (ret), "+m" (*mem)                                \
>> +                    : BR_CONSTRAINT (newval), "0" (oldval)                   \
>> +                    : "memory");                                             \
> 
>> Is the full "memory" clobber needed? Shouldn't the "+m"(*mem) be enough?
> 
> For use in acquire/release atomics, it is required since code hoisting and other
> optimizations must be prevented. So the old implementation was buggy, and this
> is why we need to remove these target specific hacks.

Yes, I noticed this while checking the Linux kernel implementation.  Although I am
not sure it really matters, since we already have a volatile asm that should prevent
code hoisting.

> 
> Also it is ridiculous to write hundreds of lines of hacky inline assembler for basic macros
> like atomic_bit_test_set when there is only a single use in all of GLIBC which can trivially
> be replaced with the compiler builtin __atomic_fetch_or.

Indeed, afaik it was done back when gcc did not provide atomic builtins across the
different architectures, so each one needed to implement them.  For newer code we
have started to require the new C11-like atomic macros, and I have started to send
some fixes to move away from the old atomics (for instance the pthread_cancel revert).
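
For reference, the difference in spelling is roughly the following (the
variable is invented, and the two calls are not exact drop-in equivalents
since the old macro has its own implied ordering):

  int count;

  /* Old-style macro; on x86 it used to expand to hand-written inline
     assembly.  */
  atomic_exchange_and_add (&count, 1);

  /* C11-like macro: the memory order is explicit and it can expand directly
     to the compiler builtins.  */
  atomic_fetch_add_acquire (&count, 1);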

> 
> Any single-threaded optimizations should be done on a much higher level and only where
> there is a clear performance gain. So we should get rid of all the atomic-machine headers.

Completely agree.  I have started to clean this up by first moving some architectures
to use compiler builtins [1].  I will check which architectures still don’t use the
compiler builtins and see if we can move them as well.

[1] https://patchwork.sourceware.org/project/glibc/patch/20210929191430.884057-1-adhemerval.zanella@linaro.org/
Adhemerval Zanella Netto June 15, 2022, 9:10 p.m. UTC | #6
> On 13 Jun 2022, at 14:31, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> Hi Adhemerval,
> 
>> The x86 atomic need some adjustments since it has single-thread
>> optimizationi builtin within the inline assemblye.  It now uses
>> SINGLE_THREAD_P and atomic optimizations are removed (since they
>> are not used).
> 
> I'd suggest removing all single-thread optimizations from target atomics.
> Many aren't used at all (e.g. catomic_or/catomic_and), and the rest have few
> uses, none of which are performance critical. The uses in malloc.c are in fact
> counterproductive since they are only reached when there are multiple threads!
> As a result I think you might not need any inline assembler at all.

That is my plan; I haven’t done it in this patchset because I first want to
consolidate the single-thread optimization to use only one internal scheme.

Once it is in, my plan is to move all architectures to use compiler builtins
if possible (I recall that the minimum gcc version prevented us from doing so
on some architectures, but I think we might be able to do it now), then
remove all the unused atomic macros, and make all internal ones use the
C11-like internal macros.
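
Roughly the shape I have in mind for the generic header once a target defines
USE_ATOMIC_COMPILER_BUILTINS (illustrative only, not the exact wording):

  # define atomic_fetch_add_relaxed(mem, operand) \
    __atomic_fetch_add ((mem), (operand), __ATOMIC_RELAXED)
  # define atomic_compare_exchange_weak_acquire(mem, expected, desired) \
    __atomic_compare_exchange_n ((mem), (expected), (desired), 1,       \
                                 __ATOMIC_ACQUIRE, __ATOMIC_RELAXED)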

> 
> I have a patch that removes all of the catomic definitions across GLIBC.
> If we find code that could benefit then we can optimize it at a higher level
> and in a generic way similar to how we optimized malloc or 
> https://sourceware.org/pipermail/libc-alpha/2022-June/139566.html.

Yes, it is on my backlog to review it.
Fangrui Song June 16, 2022, 7:35 a.m. UTC | #7
On 2022-06-10, Adhemerval Zanella via Libc-alpha wrote:
>Instead use __libc_single_threaded on all architectures.  The TCB
>field is renamed to avoid change the struct layout.
>
>The x86 atomic need some adjustments since it has single-thread
>optimizationi builtin within the inline assemblye.  It now uses
>SINGLE_THREAD_P and atomic optimizations are removed (since they
>are not used).
>
>Checked on x86_64-linux-gnu and i686-linux-gnu.
>---
> misc/tst-atomic.c                       |   1 +
> nptl/allocatestack.c                    |   6 -
> nptl/descr.h                            |  17 +-
> nptl/pthread_cancel.c                   |   7 +-
> nptl/pthread_create.c                   |   5 -
> sysdeps/i386/htl/tcb-offsets.sym        |   1 -
> sysdeps/i386/nptl/tcb-offsets.sym       |   1 -
> sysdeps/i386/nptl/tls.h                 |   4 +-
> sysdeps/ia64/nptl/tcb-offsets.sym       |   1 -
> sysdeps/ia64/nptl/tls.h                 |   2 -
> sysdeps/mach/hurd/i386/tls.h            |   4 +-
> sysdeps/nios2/nptl/tcb-offsets.sym      |   1 -
> sysdeps/or1k/nptl/tls.h                 |   2 -
> sysdeps/powerpc/nptl/tcb-offsets.sym    |   3 -
> sysdeps/powerpc/nptl/tls.h              |   3 -
> sysdeps/s390/nptl/tcb-offsets.sym       |   1 -
> sysdeps/s390/nptl/tls.h                 |   6 +-
> sysdeps/sh/nptl/tcb-offsets.sym         |   1 -
> sysdeps/sh/nptl/tls.h                   |   2 -
> sysdeps/sparc/nptl/tcb-offsets.sym      |   1 -
> sysdeps/sparc/nptl/tls.h                |   2 +-
> sysdeps/unix/sysv/linux/single-thread.h |  15 +-
> sysdeps/x86/atomic-machine.h            | 484 +++++++-----------------
> sysdeps/x86_64/nptl/tcb-offsets.sym     |   1 -
> 24 files changed, 145 insertions(+), 426 deletions(-)
>
>diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
>index 6d681a7bfd..ddbc618e25 100644
>--- a/misc/tst-atomic.c
>+++ b/misc/tst-atomic.c
>@@ -18,6 +18,7 @@
>
> #include <stdio.h>
> #include <atomic.h>
>+#include <support/xthread.h>
>
> #ifndef atomic_t
> # define atomic_t int
>diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
>index 98f5f6dd85..3e0d01cb52 100644
>--- a/nptl/allocatestack.c
>+++ b/nptl/allocatestack.c
>@@ -290,9 +290,6 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
> 	 stack cache nor will the memory (except the TLS memory) be freed.  */
>       pd->user_stack = true;
>
>-      /* This is at least the second thread.  */
>-      pd->header.multiple_threads = 1;
>-
> #ifdef NEED_DL_SYSINFO
>       SETUP_THREAD_SYSINFO (pd);
> #endif
>@@ -408,9 +405,6 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
> 	     descriptor.  */
> 	  pd->specific[0] = pd->specific_1stblock;
>
>-	  /* This is at least the second thread.  */
>-	  pd->header.multiple_threads = 1;
>-
> #ifdef NEED_DL_SYSINFO
> 	  SETUP_THREAD_SYSINFO (pd);
> #endif
>diff --git a/nptl/descr.h b/nptl/descr.h
>index bb46b5958e..77b25d8267 100644
>--- a/nptl/descr.h
>+++ b/nptl/descr.h
>@@ -137,22 +137,7 @@ struct pthread
> #else
>     struct
>     {
>-      /* multiple_threads is enabled either when the process has spawned at
>-	 least one thread or when a single-threaded process cancels itself.
>-	 This enables additional code to introduce locking before doing some
>-	 compare_and_exchange operations and also enable cancellation points.
>-	 The concepts of multiple threads and cancellation points ideally
>-	 should be separate, since it is not necessary for multiple threads to
>-	 have been created for cancellation points to be enabled, as is the
>-	 case is when single-threaded process cancels itself.
>-
>-	 Since enabling multiple_threads enables additional code in
>-	 cancellation points and compare_and_exchange operations, there is a
>-	 potential for an unneeded performance hit when it is enabled in a
>-	 single-threaded, self-canceling process.  This is OK though, since a
>-	 single-threaded process will enable async cancellation only when it
>-	 looks to cancel itself and is hence going to end anyway.  */
>-      int multiple_threads;
>+      int unused_multiple_threads;

For an unused member variable: I see that sometimes a name like
__glibc_unused1 is used.  Is unused_ preferred now?

>       int gscope_flag;
>     } header;
> #endif
>diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
>index e1735279f2..6d26a15d0e 100644
>--- a/nptl/pthread_cancel.c
>+++ b/nptl/pthread_cancel.c
>@@ -157,12 +157,9 @@ __pthread_cancel (pthread_t th)
>
> 	/* A single-threaded process should be able to kill itself, since
> 	   there is nothing in the POSIX specification that says that it
>-	   cannot.  So we set multiple_threads to true so that cancellation
>-	   points get executed.  */
>-	THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
>-#ifndef TLS_MULTIPLE_THREADS_IN_TCB
>+	   cannot.  So we set __libc_single_threaded to true so that
>+	   cancellation points get executed.  */
> 	__libc_single_threaded = 0;
>-#endif
>     }
>   while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval,
> 						newval));
>diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
>index 5633d01c62..d43865352f 100644
>--- a/nptl/pthread_create.c
>+++ b/nptl/pthread_create.c
>@@ -882,11 +882,6 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
> 	   other reason that create_thread chose.  Now let it run
> 	   free.  */
> 	lll_unlock (pd->lock, LLL_PRIVATE);
>-
>-      /* We now have for sure more than one thread.  The main thread might
>-	 not yet have the flag set.  No need to set the global variable
>-	 again if this is what we use.  */
>-      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
>     }
>
>  out:
>diff --git a/sysdeps/i386/htl/tcb-offsets.sym b/sysdeps/i386/htl/tcb-offsets.sym
>index 7b7c719369..f3f7df6c06 100644
>--- a/sysdeps/i386/htl/tcb-offsets.sym
>+++ b/sysdeps/i386/htl/tcb-offsets.sym
>@@ -2,7 +2,6 @@
> #include <tls.h>
> #include <kernel-features.h>
>
>-MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads)
> SYSINFO_OFFSET          offsetof (tcbhead_t, sysinfo)
> POINTER_GUARD           offsetof (tcbhead_t, pointer_guard)
> SIGSTATE_OFFSET         offsetof (tcbhead_t, _hurd_sigstate)
>diff --git a/sysdeps/i386/nptl/tcb-offsets.sym b/sysdeps/i386/nptl/tcb-offsets.sym
>index 2ec9e787c1..1efd1469d8 100644
>--- a/sysdeps/i386/nptl/tcb-offsets.sym
>+++ b/sysdeps/i386/nptl/tcb-offsets.sym
>@@ -6,7 +6,6 @@ RESULT			offsetof (struct pthread, result)
> TID			offsetof (struct pthread, tid)
> CANCELHANDLING		offsetof (struct pthread, cancelhandling)
> CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
>-MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
> SYSINFO_OFFSET		offsetof (tcbhead_t, sysinfo)
> CLEANUP			offsetof (struct pthread, cleanup)
> CLEANUP_PREV		offsetof (struct _pthread_cleanup_buffer, __prev)
>diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
>index 91090bf287..48940a9f44 100644
>--- a/sysdeps/i386/nptl/tls.h
>+++ b/sysdeps/i386/nptl/tls.h
>@@ -36,7 +36,7 @@ typedef struct
> 			   thread descriptor used by libpthread.  */
>   dtv_t *dtv;
>   void *self;		/* Pointer to the thread descriptor.  */
>-  int multiple_threads;
>+  int unused_multiple_threads;
>   uintptr_t sysinfo;
>   uintptr_t stack_guard;
>   uintptr_t pointer_guard;
>@@ -57,8 +57,6 @@ typedef struct
> _Static_assert (offsetof (tcbhead_t, __private_ss) == 0x30,
> 		"offset of __private_ss != 0x30");
>
>-# define TLS_MULTIPLE_THREADS_IN_TCB 1
>-
> #else /* __ASSEMBLER__ */
> # include <tcb-offsets.h>
> #endif
>diff --git a/sysdeps/ia64/nptl/tcb-offsets.sym b/sysdeps/ia64/nptl/tcb-offsets.sym
>index b01f712be2..ab2cb180f9 100644
>--- a/sysdeps/ia64/nptl/tcb-offsets.sym
>+++ b/sysdeps/ia64/nptl/tcb-offsets.sym
>@@ -2,5 +2,4 @@
> #include <tls.h>
>
> TID			offsetof (struct pthread, tid) - TLS_PRE_TCB_SIZE
>-MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads) - TLS_PRE_TCB_SIZE
> SYSINFO_OFFSET		offsetof (tcbhead_t, __private)
>diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
>index 8ccedb73e6..008e080fc4 100644
>--- a/sysdeps/ia64/nptl/tls.h
>+++ b/sysdeps/ia64/nptl/tls.h
>@@ -36,8 +36,6 @@ typedef struct
>
> register struct pthread *__thread_self __asm__("r13");
>
>-# define TLS_MULTIPLE_THREADS_IN_TCB 1
>-
> #else /* __ASSEMBLER__ */
> # include <tcb-offsets.h>
> #endif
>diff --git a/sysdeps/mach/hurd/i386/tls.h b/sysdeps/mach/hurd/i386/tls.h
>index 264ed9a9c5..d33e91c922 100644
>--- a/sysdeps/mach/hurd/i386/tls.h
>+++ b/sysdeps/mach/hurd/i386/tls.h
>@@ -33,7 +33,7 @@ typedef struct
>   void *tcb;			/* Points to this structure.  */
>   dtv_t *dtv;			/* Vector of pointers to TLS data.  */
>   thread_t self;		/* This thread's control port.  */
>-  int multiple_threads;
>+  int unused_multiple_threads;
>   uintptr_t sysinfo;
>   uintptr_t stack_guard;
>   uintptr_t pointer_guard;
>@@ -117,8 +117,6 @@ _hurd_tls_init (tcbhead_t *tcb)
>   /* This field is used by TLS accesses to get our "thread pointer"
>      from the TLS point of view.  */
>   tcb->tcb = tcb;
>-  /* We always at least start the sigthread anyway.  */
>-  tcb->multiple_threads = 1;
>
>   /* Get the first available selector.  */
>   int sel = -1;
>diff --git a/sysdeps/nios2/nptl/tcb-offsets.sym b/sysdeps/nios2/nptl/tcb-offsets.sym
>index 3cd8d984ac..93a695ac7f 100644
>--- a/sysdeps/nios2/nptl/tcb-offsets.sym
>+++ b/sysdeps/nios2/nptl/tcb-offsets.sym
>@@ -8,6 +8,5 @@
> # define __thread_self          ((void *) 0)
> # define thread_offsetof(mem)   ((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
>
>-MULTIPLE_THREADS_OFFSET		thread_offsetof (header.multiple_threads)
> TID_OFFSET			thread_offsetof (tid)
> POINTER_GUARD			(offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
>diff --git a/sysdeps/or1k/nptl/tls.h b/sysdeps/or1k/nptl/tls.h
>index c6ffe62c3f..3bb07beef8 100644
>--- a/sysdeps/or1k/nptl/tls.h
>+++ b/sysdeps/or1k/nptl/tls.h
>@@ -35,8 +35,6 @@ typedef struct
>
> register tcbhead_t *__thread_self __asm__("r10");
>
>-# define TLS_MULTIPLE_THREADS_IN_TCB 1
>-
> /* Get system call information.  */
> # include <sysdep.h>
>
>diff --git a/sysdeps/powerpc/nptl/tcb-offsets.sym b/sysdeps/powerpc/nptl/tcb-offsets.sym
>index 4c01615ad0..a0ee95f94d 100644
>--- a/sysdeps/powerpc/nptl/tcb-offsets.sym
>+++ b/sysdeps/powerpc/nptl/tcb-offsets.sym
>@@ -10,9 +10,6 @@
> # define thread_offsetof(mem)	((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
>
>
>-#if TLS_MULTIPLE_THREADS_IN_TCB
>-MULTIPLE_THREADS_OFFSET		thread_offsetof (header.multiple_threads)
>-#endif
> TID				thread_offsetof (tid)
> POINTER_GUARD			(offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
> TAR_SAVE			(offsetof (tcbhead_t, tar_save) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
>diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
>index 22b0075235..fd5ee51981 100644
>--- a/sysdeps/powerpc/nptl/tls.h
>+++ b/sysdeps/powerpc/nptl/tls.h
>@@ -52,9 +52,6 @@
> # define TLS_DTV_AT_TP	1
> # define TLS_TCB_AT_TP	0
>
>-/* We use the multiple_threads field in the pthread struct */
>-#define TLS_MULTIPLE_THREADS_IN_TCB	1
>-
> /* Get the thread descriptor definition.  */
> # include <nptl/descr.h>
>
>diff --git a/sysdeps/s390/nptl/tcb-offsets.sym b/sysdeps/s390/nptl/tcb-offsets.sym
>index 9c1c01f353..bc7b267463 100644
>--- a/sysdeps/s390/nptl/tcb-offsets.sym
>+++ b/sysdeps/s390/nptl/tcb-offsets.sym
>@@ -1,6 +1,5 @@
> #include <sysdep.h>
> #include <tls.h>
>
>-MULTIPLE_THREADS_OFFSET		offsetof (tcbhead_t, multiple_threads)
> STACK_GUARD			offsetof (tcbhead_t, stack_guard)
> TID				offsetof (struct pthread, tid)
>diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
>index ff210ffeb2..d69ed539f7 100644
>--- a/sysdeps/s390/nptl/tls.h
>+++ b/sysdeps/s390/nptl/tls.h
>@@ -35,7 +35,7 @@ typedef struct
> 			   thread descriptor used by libpthread.  */
>   dtv_t *dtv;
>   void *self;		/* Pointer to the thread descriptor.  */
>-  int multiple_threads;
>+  int unused_multiple_threads;
>   uintptr_t sysinfo;
>   uintptr_t stack_guard;
>   int gscope_flag;
>@@ -44,10 +44,6 @@ typedef struct
>   void *__private_ss;
> } tcbhead_t;
>
>-# ifndef __s390x__
>-#  define TLS_MULTIPLE_THREADS_IN_TCB 1
>-# endif
>-
> #else /* __ASSEMBLER__ */
> # include <tcb-offsets.h>
> #endif
>diff --git a/sysdeps/sh/nptl/tcb-offsets.sym b/sysdeps/sh/nptl/tcb-offsets.sym
>index 234207779d..4e452d9c6c 100644
>--- a/sysdeps/sh/nptl/tcb-offsets.sym
>+++ b/sysdeps/sh/nptl/tcb-offsets.sym
>@@ -6,7 +6,6 @@ RESULT			offsetof (struct pthread, result)
> TID			offsetof (struct pthread, tid)
> CANCELHANDLING		offsetof (struct pthread, cancelhandling)
> CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
>-MULTIPLE_THREADS_OFFSET	offsetof (struct pthread, header.multiple_threads)
> TLS_PRE_TCB_SIZE	sizeof (struct pthread)
> MUTEX_FUTEX		offsetof (pthread_mutex_t, __data.__lock)
> POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
>diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
>index 76591ab6ef..8778cb4ac0 100644
>--- a/sysdeps/sh/nptl/tls.h
>+++ b/sysdeps/sh/nptl/tls.h
>@@ -36,8 +36,6 @@ typedef struct
>   uintptr_t pointer_guard;
> } tcbhead_t;
>
>-# define TLS_MULTIPLE_THREADS_IN_TCB 1
>-
> #else /* __ASSEMBLER__ */
> # include <tcb-offsets.h>
> #endif /* __ASSEMBLER__ */
>diff --git a/sysdeps/sparc/nptl/tcb-offsets.sym b/sysdeps/sparc/nptl/tcb-offsets.sym
>index f75d02065e..e4a7e4720f 100644
>--- a/sysdeps/sparc/nptl/tcb-offsets.sym
>+++ b/sysdeps/sparc/nptl/tcb-offsets.sym
>@@ -1,6 +1,5 @@
> #include <sysdep.h>
> #include <tls.h>
>
>-MULTIPLE_THREADS_OFFSET		offsetof (tcbhead_t, multiple_threads)
> POINTER_GUARD			offsetof (tcbhead_t, pointer_guard)
> TID				offsetof (struct pthread, tid)
>diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
>index d1e2bb4ad1..b78cf0d6b4 100644
>--- a/sysdeps/sparc/nptl/tls.h
>+++ b/sysdeps/sparc/nptl/tls.h
>@@ -35,7 +35,7 @@ typedef struct
> 			   thread descriptor used by libpthread.  */
>   dtv_t *dtv;
>   void *self;
>-  int multiple_threads;
>+  int unused_multiple_threads;
> #if __WORDSIZE == 64
>   int gscope_flag;
> #endif
>diff --git a/sysdeps/unix/sysv/linux/single-thread.h b/sysdeps/unix/sysv/linux/single-thread.h
>index 208edccce6..dd80e82c82 100644
>--- a/sysdeps/unix/sysv/linux/single-thread.h
>+++ b/sysdeps/unix/sysv/linux/single-thread.h
>@@ -23,20 +23,7 @@
> # include <sys/single_threaded.h>
> #endif
>
>-/* The default way to check if the process is single thread is by using the
>-   pthread_t 'multiple_threads' field.  However, for some architectures it is
>-   faster to either use an extra field on TCB or global variables (the TCB
>-   field is also used on x86 for some single-thread atomic optimizations).
>-
>-   The ABI might define SINGLE_THREAD_BY_GLOBAL to enable the single thread
>-   check to use global variables instead of the pthread_t field.  */
>-
>-#if !defined SINGLE_THREAD_BY_GLOBAL || IS_IN (rtld)
>-# define SINGLE_THREAD_P \
>-  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
>-#else
>-# define SINGLE_THREAD_P (__libc_single_threaded != 0)
>-#endif
>+#define SINGLE_THREAD_P (__libc_single_threaded != 0)
>
> #define RTLD_SINGLE_THREAD_P SINGLE_THREAD_P
>
>diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
>index f24f1c71ed..23e087e7e0 100644
>--- a/sysdeps/x86/atomic-machine.h
>+++ b/sysdeps/x86/atomic-machine.h
>@@ -51,292 +51,145 @@
> #define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
>   (! __sync_bool_compare_and_swap (mem, oldval, newval))
>
>+#define __cmpxchg_op(lock, mem, newval, oldval)				      \
>+  ({ __typeof (*mem) __ret;						      \
>+     if (sizeof (*mem) == 1)						      \
>+       asm volatile (lock "cmpxchgb %2, %1"				      \
>+		     : "=a" (ret), "+m" (*mem)				      \
>+		     : BR_CONSTRAINT (newval), "0" (oldval)	  	      \
>+		     : "memory");					      \
>+     else if (sizeof (*mem) == 2)					      \
>+       asm volatile (lock "cmpxchgw %2, %1"				      \
>+		     : "=a" (ret), "+m" (*mem)				      \
>+		     : BR_CONSTRAINT (newval), "0" (oldval)	  	      \
>+		     : "memory");					      \
>+     else if (sizeof (*mem) == 4)					      \
>+       asm volatile (lock "cmpxchgl %2, %1"				      \
>+		     : "=a" (ret), "+m" (*mem)				      \
>+		     : BR_CONSTRAINT (newval), "0" (oldval)	  	      \
>+		     : "memory");					      \
>+     else if (__HAVE_64B_ATOMICS)					      \
>+       asm volatile (lock "cmpxchgq %2, %1"				      \
>+                    : "=a" (ret), "+m" (*mem)				      \
>+                    : "q" ((int64_t) cast_to_integer (newval)),		      \
>+                      "0" ((int64_t) cast_to_integer (oldval))		      \
>+                    : "memory");					      \
>+     else								      \
>+       __atomic_link_error ();						      \
>+     __ret; })
>
>-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
>+#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval)	      \
>   ({ __typeof (*mem) ret;						      \
>-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
>-		       "je 0f\n\t"					      \
>-		       "lock\n"						      \
>-		       "0:\tcmpxchgb %b2, %1"				      \
>-		       : "=a" (ret), "=m" (*mem)			      \
>-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
>-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
>+     if (SINGLE_THREAD_P)						      \
>+       __cmpxchg_op ("", (mem), (newval), (oldval));			      \
>+     else								      \
>+       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));		      \
>      ret; })
>
>-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
>+#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval)	      \
>   ({ __typeof (*mem) ret;						      \
>-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
>-		       "je 0f\n\t"					      \
>-		       "lock\n"						      \
>-		       "0:\tcmpxchgw %w2, %1"				      \
>-		       : "=a" (ret), "=m" (*mem)			      \
>-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
>-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
>+     if (SINGLE_THREAD_P)						      \
>+       __cmpxchg_op ("", (mem), (newval), (oldval));			      \
>+     else								      \
>+       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));		      \
>      ret; })
>
>-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
>+#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval)	      \
>   ({ __typeof (*mem) ret;						      \
>-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
>-		       "je 0f\n\t"					      \
>-		       "lock\n"						      \
>-		       "0:\tcmpxchgl %2, %1"				      \
>-		       : "=a" (ret), "=m" (*mem)			      \
>-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
>-			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
>+     if (SINGLE_THREAD_P)						      \
>+       __cmpxchg_op ("", (mem), (newval), (oldval));			      \
>+     else								      \
>+       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));		      \
>      ret; })
>
>-#ifdef __x86_64__
>-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
>+#define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
>   ({ __typeof (*mem) ret;						      \
>-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
>-		       "je 0f\n\t"					      \
>-		       "lock\n"						      \
>-		       "0:\tcmpxchgq %q2, %1"				      \
>-		       : "=a" (ret), "=m" (*mem)			      \
>-		       : "q" ((int64_t) cast_to_integer (newval)),	      \
>-			 "m" (*mem),					      \
>-			 "0" ((int64_t) cast_to_integer (oldval)),	      \
>-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
>-     ret; })
>-# define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
>-# define do_add_val_64_acq(pfx, mem, value) do { } while (0)
>-#else
>-/* XXX We do not really need 64-bit compare-and-exchange.  At least
>-   not in the moment.  Using it would mean causing portability
>-   problems since not many other 32-bit architectures have support for
>-   such an operation.  So don't define any code for now.  If it is
>-   really going to be used the code below can be used on Intel Pentium
>-   and later, but NOT on i486.  */
>-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
>-  ({ __typeof (*mem) ret = *(mem);					      \
>-     __atomic_link_error ();						      \
>-     ret = (newval);							      \
>-     ret = (oldval);							      \
>-     ret; })
>-
>-# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
>-  ({ __typeof (*mem) ret = *(mem);					      \
>-     __atomic_link_error ();						      \
>-     ret = (newval);							      \
>-     ret = (oldval);							      \
>-     ret; })
>-
>-# define do_exchange_and_add_val_64_acq(pfx, mem, value) \
>-  ({ __typeof (value) __addval = (value);				      \
>-     __typeof (*mem) __result;						      \
>-     __typeof (mem) __memp = (mem);					      \
>-     __typeof (*mem) __tmpval;						      \
>-     __result = *__memp;						      \
>-     do									      \
>-       __tmpval = __result;						      \
>-     while ((__result = pfx##_compare_and_exchange_val_64_acq		      \
>-	     (__memp, __result + __addval, __result)) == __tmpval);	      \
>-     __result; })
>-
>-# define do_add_val_64_acq(pfx, mem, value) \
>-  {									      \
>-    __typeof (value) __addval = (value);				      \
>-    __typeof (mem) __memp = (mem);					      \
>-    __typeof (*mem) __oldval = *__memp;					      \
>-    __typeof (*mem) __tmpval;						      \
>-    do									      \
>-      __tmpval = __oldval;						      \
>-    while ((__oldval = pfx##_compare_and_exchange_val_64_acq		      \
>-	    (__memp, __oldval + __addval, __oldval)) == __tmpval);	      \
>-  }
>-#endif
>-
>-
>-/* Note that we need no lock prefix.  */
>-#define atomic_exchange_acq(mem, newvalue) \
>-  ({ __typeof (*mem) result;						      \
>-     if (sizeof (*mem) == 1)						      \
>-       __asm __volatile ("xchgb %b0, %1"				      \
>-			 : "=q" (result), "=m" (*mem)			      \
>-			 : "0" (newvalue), "m" (*mem));			      \
>-     else if (sizeof (*mem) == 2)					      \
>-       __asm __volatile ("xchgw %w0, %1"				      \
>-			 : "=r" (result), "=m" (*mem)			      \
>-			 : "0" (newvalue), "m" (*mem));			      \
>-     else if (sizeof (*mem) == 4)					      \
>-       __asm __volatile ("xchgl %0, %1"					      \
>-			 : "=r" (result), "=m" (*mem)			      \
>-			 : "0" (newvalue), "m" (*mem));			      \
>-     else if (__HAVE_64B_ATOMICS)					      \
>-       __asm __volatile ("xchgq %q0, %1"				      \
>-			 : "=r" (result), "=m" (*mem)			      \
>-			 : "0" ((int64_t) cast_to_integer (newvalue)),        \
>-			   "m" (*mem));					      \
>-     else								      \
>-       {								      \
>-	 result = 0;							      \
>-	 __atomic_link_error ();					      \
>-       }								      \
>-     result; })
>-
>-
>-#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
>-  ({ __typeof (*mem) __result;						      \
>-     __typeof (value) __addval = (value);				      \
>-     if (sizeof (*mem) == 1)						      \
>-       __asm __volatile (lock "xaddb %b0, %1"				      \
>-			 : "=q" (__result), "=m" (*mem)			      \
>-			 : "0" (__addval), "m" (*mem),			      \
>-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
>-     else if (sizeof (*mem) == 2)					      \
>-       __asm __volatile (lock "xaddw %w0, %1"				      \
>-			 : "=r" (__result), "=m" (*mem)			      \
>-			 : "0" (__addval), "m" (*mem),			      \
>-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
>-     else if (sizeof (*mem) == 4)					      \
>-       __asm __volatile (lock "xaddl %0, %1"				      \
>-			 : "=r" (__result), "=m" (*mem)			      \
>-			 : "0" (__addval), "m" (*mem),			      \
>-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
>-     else if (__HAVE_64B_ATOMICS)					      \
>-       __asm __volatile (lock "xaddq %q0, %1"				      \
>-			 : "=r" (__result), "=m" (*mem)			      \
>-			 : "0" ((int64_t) cast_to_integer (__addval)),     \
>-			   "m" (*mem),					      \
>-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
>+     if (SINGLE_THREAD_P)						      \
>+       __cmpxchg_op ("", (mem), (newval), (oldval));			      \
>      else								      \
>-       __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
>-     __result; })
>-
>-#define atomic_exchange_and_add(mem, value) \
>-  __sync_fetch_and_add (mem, value)
>-
>-#define __arch_exchange_and_add_cprefix \
>-  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
>-
>-#define catomic_exchange_and_add(mem, value) \
>-  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
>-				mem, value)
>-
>-
>-#define __arch_add_body(lock, pfx, apfx, mem, value) \
>-  do {									      \
>-    if (__builtin_constant_p (value) && (value) == 1)			      \
>-      pfx##_increment (mem);						      \
>-    else if (__builtin_constant_p (value) && (value) == -1)		      \
>-      pfx##_decrement (mem);						      \
>-    else if (sizeof (*mem) == 1)					      \
>-      __asm __volatile (lock "addb %b1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: IBR_CONSTRAINT (value), "m" (*mem),		      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 2)					      \
>-      __asm __volatile (lock "addw %w1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (value), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 4)					      \
>-      __asm __volatile (lock "addl %1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (value), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (__HAVE_64B_ATOMICS)					      \
>-      __asm __volatile (lock "addq %q1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" ((int64_t) cast_to_integer (value)),	      \
>-			  "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else								      \
>-      do_add_val_64_acq (apfx, (mem), (value));				      \
>-  } while (0)
>-
>-# define atomic_add(mem, value) \
>-  __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
>-
>-#define __arch_add_cprefix \
>-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
>-
>-#define catomic_add(mem, value) \
>-  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
>+       __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));		      \
>+     ret; })
>
>
>-#define atomic_add_negative(mem, value) \
>-  ({ unsigned char __result;						      \
>+#define __xchg_op(lock, mem, arg, op)					      \
>+  ({ __typeof (*mem) __ret = (arg);					      \
>      if (sizeof (*mem) == 1)						      \
>-       __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : IBR_CONSTRAINT (value), "m" (*mem));		      \
>+       __asm __volatile (lock #op "b %b0, %1"				      \
>+			 : "=q" (__ret), "=m" (*mem)			      \
>+			 : "0" (arg), "m" (*mem)			      \
>+			 : "memory", "cc");				      \
>      else if (sizeof (*mem) == 2)					      \
>-       __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : "ir" (value), "m" (*mem));			      \
>+       __asm __volatile (lock #op "w %w0, %1"				      \
>+			 : "=r" (__ret), "=m" (*mem)			      \
>+			 : "0" (arg), "m" (*mem)			      \
>+			 : "memory", "cc");				      \
>      else if (sizeof (*mem) == 4)					      \
>-       __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : "ir" (value), "m" (*mem));			      \
>+       __asm __volatile (lock #op "l %0, %1"				      \
>+			 : "=r" (__ret), "=m" (*mem)			      \
>+			 : "0" (arg), "m" (*mem)			      \
>+			 : "memory", "cc");				      \
>      else if (__HAVE_64B_ATOMICS)					      \
>-       __asm __volatile (LOCK_PREFIX "addq %q2, %0; sets %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : "ir" ((int64_t) cast_to_integer (value)),	      \
>-			   "m" (*mem));					      \
>+       __asm __volatile (lock #op "q %q0, %1"				      \
>+			 : "=r" (__ret), "=m" (*mem)			      \
>+			 : "0" ((int64_t) cast_to_integer (arg)),	      \
>+			   "m" (*mem)					      \
>+			 : "memory", "cc");				      \
>      else								      \
>        __atomic_link_error ();						      \
>-     __result; })
>-
>+     __ret; })
>
>-#define atomic_add_zero(mem, value) \
>-  ({ unsigned char __result;						      \
>+#define __single_op(lock, mem, op)					      \
>+  ({									      \
>      if (sizeof (*mem) == 1)						      \
>-       __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : IBR_CONSTRAINT (value), "m" (*mem));		      \
>+       __asm __volatile (lock #op "b %b0"				      \
>+			 : "=m" (*mem)					      \
>+			 : "m" (*mem)					      \
>+			 : "memory", "cc");				      \
>      else if (sizeof (*mem) == 2)					      \
>-       __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : "ir" (value), "m" (*mem));			      \
>+       __asm __volatile (lock #op "w %b0"				      \
>+			 : "=m" (*mem)					      \
>+			 : "m" (*mem)					      \
>+			 : "memory", "cc");				      \
>      else if (sizeof (*mem) == 4)					      \
>-       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : "ir" (value), "m" (*mem));			      \
>+       __asm __volatile (lock #op "l %b0"				      \
>+			 : "=m" (*mem)					      \
>+			 : "m" (*mem)					      \
>+			 : "memory", "cc");				      \
>      else if (__HAVE_64B_ATOMICS)					      \
>-       __asm __volatile (LOCK_PREFIX "addq %q2, %0; setz %1"		      \
>-			 : "=m" (*mem), "=qm" (__result)		      \
>-			 : "ir" ((int64_t) cast_to_integer (value)),	      \
>-			   "m" (*mem));					      \
>+       __asm __volatile (lock #op "q %b0"				      \
>+			 : "=m" (*mem)					      \
>+			 : "m" (*mem)					      \
>+			 : "memory", "cc");				      \
>      else								      \
>-       __atomic_link_error ();					      \
>-     __result; })
>+       __atomic_link_error ();						      \
>+  })
>
>+/* Note that we need no lock prefix.  */
>+#define atomic_exchange_acq(mem, newvalue)				      \
>+  __xchg_op ("", (mem), (newvalue), xchg)
>
>-#define __arch_increment_body(lock, pfx, mem) \
>-  do {									      \
>-    if (sizeof (*mem) == 1)						      \
>-      __asm __volatile (lock "incb %b0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 2)					      \
>-      __asm __volatile (lock "incw %w0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 4)					      \
>-      __asm __volatile (lock "incl %0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (__HAVE_64B_ATOMICS)					      \
>-      __asm __volatile (lock "incq %q0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else								      \
>-      do_add_val_64_acq (pfx, mem, 1);					      \
>-  } while (0)
>+#define atomic_add(mem, value) \
>+  __xchg_op (LOCK_PREFIX, (mem), (value), add);				      \
>
>-#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
>+#define catomic_add(mem, value)						      \
>+  ({									      \
>+    if (SINGLE_THREAD_P)						      \
>+      __xchg_op ("", (mem), (value), add);				      \
>+   else									      \
>+     atomic_add (mem, value);						      \
>+  })
>
>-#define __arch_increment_cprefix \
>-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
>
>-#define catomic_increment(mem) \
>-  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
>+#define atomic_increment(mem) \
>+  __single_op (LOCK_PREFIX, (mem), inc)
>
>+#define catomic_increment(mem)						      \
>+  ({									      \
>+    if (SINGLE_THREAD_P)						      \
>+      __single_op ("", (mem), inc);					      \
>+   else									      \
>+     atomic_increment (mem);						      \
>+  })
>
> #define atomic_increment_and_test(mem) \
>   ({ unsigned char __result;						      \
>@@ -357,43 +210,20 @@
> 			 : "=m" (*mem), "=qm" (__result)		      \
> 			 : "m" (*mem));					      \
>      else								      \
>-       __atomic_link_error ();					      \
>+       __atomic_link_error ();						      \
>      __result; })
>
>
>-#define __arch_decrement_body(lock, pfx, mem) \
>-  do {									      \
>-    if (sizeof (*mem) == 1)						      \
>-      __asm __volatile (lock "decb %b0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 2)					      \
>-      __asm __volatile (lock "decw %w0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 4)					      \
>-      __asm __volatile (lock "decl %0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (__HAVE_64B_ATOMICS)					      \
>-      __asm __volatile (lock "decq %q0"					      \
>-			: "=m" (*mem)					      \
>-			: "m" (*mem),					      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else								      \
>-      do_add_val_64_acq (pfx, mem, -1);					      \
>-  } while (0)
>-
>-#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
>+#define atomic_decrement(mem)						      \
>+  __single_op (LOCK_PREFIX, (mem), dec)
>
>-#define __arch_decrement_cprefix \
>-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
>-
>-#define catomic_decrement(mem) \
>-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
>+#define catomic_decrement(mem)						      \
>+  ({									      \
>+    if (SINGLE_THREAD_P)						      \
>+      __single_op ("", (mem), dec);					      \
>+   else									      \
>+     atomic_decrement (mem);						      \
>+  })
>
>
> #define atomic_decrement_and_test(mem) \
>@@ -463,73 +293,31 @@
> 			 : "=q" (__result), "=m" (*mem)			      \
> 			 : "m" (*mem), "ir" (bit));			      \
>      else							      	      \
>-       __atomic_link_error ();					      \
>+       __atomic_link_error ();						      \
>      __result; })
>
>
>-#define __arch_and_body(lock, mem, mask) \
>-  do {									      \
>-    if (sizeof (*mem) == 1)						      \
>-      __asm __volatile (lock "andb %b1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 2)					      \
>-      __asm __volatile (lock "andw %w1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (mask), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 4)					      \
>-      __asm __volatile (lock "andl %1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (mask), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (__HAVE_64B_ATOMICS)					      \
>-      __asm __volatile (lock "andq %q1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (mask), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else								      \
>-      __atomic_link_error ();						      \
>-  } while (0)
>-
>-#define __arch_cprefix \
>-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
>-
>-#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
>-
>-#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
>+#define atomic_and(mem, mask)						      \
>+  __xchg_op (LOCK_PREFIX, (mem), (mask), and)
>
>+#define catomic_and(mem, mask) \
>+  ({									      \
>+    if (SINGLE_THREAD_P)						      \
>+      __xchg_op ("", (mem), (mask), and);				      \
>+   else									      \
>+      atomic_and (mem, mask);						      \
>+  })
>
>-#define __arch_or_body(lock, mem, mask) \
>-  do {									      \
>-    if (sizeof (*mem) == 1)						      \
>-      __asm __volatile (lock "orb %b1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 2)					      \
>-      __asm __volatile (lock "orw %w1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (mask), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (sizeof (*mem) == 4)					      \
>-      __asm __volatile (lock "orl %1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (mask), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else if (__HAVE_64B_ATOMICS)					      \
>-      __asm __volatile (lock "orq %q1, %0"				      \
>-			: "=m" (*mem)					      \
>-			: "ir" (mask), "m" (*mem),			      \
>-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
>-    else								      \
>-      __atomic_link_error ();						      \
>-  } while (0)
>-
>-#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
>+#define atomic_or(mem, mask)						      \
>+  __xchg_op (LOCK_PREFIX, (mem), (mask), or)
>
>-#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
>+#define catomic_or(mem, mask) \
>+  ({									      \
>+    if (SINGLE_THREAD_P)						      \
>+      __xchg_op ("", (mem), (mask), or);				      \
>+   else									      \
>+      atomic_or (mem, mask);						      \
>+  })
>
> /* We don't use mfence because it is supposedly slower due to having to
>    provide stronger guarantees (e.g., regarding self-modifying code).  */
>diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
>index 2bbd563a6c..8ec55a7ea8 100644
>--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
>+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
>@@ -9,7 +9,6 @@ CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
> CLEANUP			offsetof (struct pthread, cleanup)
> CLEANUP_PREV		offsetof (struct _pthread_cleanup_buffer, __prev)
> MUTEX_FUTEX		offsetof (pthread_mutex_t, __data.__lock)
>-MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
> POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
> FEATURE_1_OFFSET	offsetof (tcbhead_t, feature_1)
> SSP_BASE_OFFSET		offsetof (tcbhead_t, ssp_base)
>-- 
>2.34.1
>
Wilco Dijkstra June 16, 2022, 12:48 p.m. UTC | #8
Hi Adhemerval,

>> For use in acquire/release atomics, it is required since code hoisting and other
>> optimizations must be prevented. So the old implementation was buggy, and this
>> is why we need to remove these target specific hacks.
>
> Yes, I noticed this while checking the Linux kernel implementation.  Although I am
> not sure it really matters, since we already have a volatile asm that should prevent
> code hoisting.

Yes it really does matter - volatile asm does not block any optimizations across it.
This example shows how it fails:

int x, y;
int g(void)
{
  y = 3;
  //__atomic_fetch_add (&x, 1, __ATOMIC_ACQUIRE);
  asm volatile ("lock add %1, 1" : "+m" (x) ::  );
  return x + y;
}

The value of y propagates across the acquire without reloading it:

        mov     DWORD PTR y[rip], 3
        lock add DWORD PTR x[rip], 1
        mov     eax, DWORD PTR x[rip]
        add     eax, 3      // bug - no reload of y!!!
        ret

With the atomic or "memory" constraint we get the correct:

        mov     DWORD PTR y[rip], 3
        lock add        DWORD PTR x[rip], 1
        mov     eax, DWORD PTR y[rip]
        add     eax, DWORD PTR x[rip]
        ret

>> Any single-threaded optimizations should be done on a much higher level and only where
>> there is a clear performance gain. So we should get rid of all the atomic-machine headers.
>
> Completely agree.  I have started to clean this up by first moving some architectures
> to use compiler builtins [1].  I will check which architectures still don’t use the
> compiler builtins and see if we can move them as well.

Yes I think it should be possible to move everything to use USE_ATOMIC_COMPILER_BUILTINS.
However targets that already use it still have a significant amount of atomic macros.

Cheers,
Wilco
Adhemerval Zanella Netto June 16, 2022, 5:23 p.m. UTC | #9
> On 16 Jun 2022, at 05:48, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> 
> Hi Adhemerval,
> 
>>> For use in acquire/release atomics, it is required since code hoisting and other
>>> optimizations must be prevented. So the old implementation was buggy, and this
>>> is why we need to remove these target specific hacks.
>> 
>> Yes, I noticed this while checking the Linux kernel implementation.  Although I am
>> not sure it really matters, since we already have a volatile asm that should prevent
>> code hoisting.
> 
> Yes it really does matter - volatile asm does not block any optimizations across it.
> This example shows how it fails:
> 
> int x, y;
> int g(void)
> {
>   y = 3;
>   //__atomic_fetch_add (&x, 1, __ATOMIC_ACQUIRE);
>   asm volatile ("lock add %1, 1" : "+m" (x) ::  );
>   return x + y;
> }
> 
> The value of y propagates across the acquire without reloading it:
> 
>        mov     DWORD PTR y[rip], 3
>        lock add DWORD PTR x[rip], 1
>        mov     eax, DWORD PTR x[rip]
>        add     eax, 3      // bug - no reload of y!!!
>        ret
> 
> With the atomic or "memory" constraint we get the correct:
> 
>        mov     DWORD PTR y[rip], 3
>        lock add        DWORD PTR x[rip], 1
>        mov     eax, DWORD PTR y[rip]
>        add     eax, DWORD PTR x[rip]
>        ret

Interesting, a good point in favour of moving away from re-implementing atomic
operations now that we have proper compiler support (especially now that we don’t
support tricky ABIs like sparcv7).

> 
>>> Any single-threaded optimizations should be done on a much higher level and only where
>>> there is a clear performance gain. So we should get rid of all the atomic-machine headers.
>> 
>> Complete agree, I have started to clean up by first moving some architectures to use
>> compiler builtins [1]. I will check which are the architectures that still don’t use compiler
>> builtin and see if we move them.
> 
> Yes I think it should be possible to move everything to use USE_ATOMIC_COMPILER_BUILTINS.
> However targets that already use it still have a significant amount of atomic macros.

The next step will be to consolidate all the atomic macros in the generic atomic.h.
diff mbox series

Patch

diff --git a/misc/tst-atomic.c b/misc/tst-atomic.c
index 6d681a7bfd..ddbc618e25 100644
--- a/misc/tst-atomic.c
+++ b/misc/tst-atomic.c
@@ -18,6 +18,7 @@ 
 
 #include <stdio.h>
 #include <atomic.h>
+#include <support/xthread.h>
 
 #ifndef atomic_t
 # define atomic_t int
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 98f5f6dd85..3e0d01cb52 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -290,9 +290,6 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 	 stack cache nor will the memory (except the TLS memory) be freed.  */
       pd->user_stack = true;
 
-      /* This is at least the second thread.  */
-      pd->header.multiple_threads = 1;
-
 #ifdef NEED_DL_SYSINFO
       SETUP_THREAD_SYSINFO (pd);
 #endif
@@ -408,9 +405,6 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 	     descriptor.  */
 	  pd->specific[0] = pd->specific_1stblock;
 
-	  /* This is at least the second thread.  */
-	  pd->header.multiple_threads = 1;
-
 #ifdef NEED_DL_SYSINFO
 	  SETUP_THREAD_SYSINFO (pd);
 #endif
diff --git a/nptl/descr.h b/nptl/descr.h
index bb46b5958e..77b25d8267 100644
--- a/nptl/descr.h
+++ b/nptl/descr.h
@@ -137,22 +137,7 @@  struct pthread
 #else
     struct
     {
-      /* multiple_threads is enabled either when the process has spawned at
-	 least one thread or when a single-threaded process cancels itself.
-	 This enables additional code to introduce locking before doing some
-	 compare_and_exchange operations and also enable cancellation points.
-	 The concepts of multiple threads and cancellation points ideally
-	 should be separate, since it is not necessary for multiple threads to
-	 have been created for cancellation points to be enabled, as is the
-	 case is when single-threaded process cancels itself.
-
-	 Since enabling multiple_threads enables additional code in
-	 cancellation points and compare_and_exchange operations, there is a
-	 potential for an unneeded performance hit when it is enabled in a
-	 single-threaded, self-canceling process.  This is OK though, since a
-	 single-threaded process will enable async cancellation only when it
-	 looks to cancel itself and is hence going to end anyway.  */
-      int multiple_threads;
+      int unused_multiple_threads;
       int gscope_flag;
     } header;
 #endif
diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
index e1735279f2..6d26a15d0e 100644
--- a/nptl/pthread_cancel.c
+++ b/nptl/pthread_cancel.c
@@ -157,12 +157,9 @@  __pthread_cancel (pthread_t th)
 
 	/* A single-threaded process should be able to kill itself, since
 	   there is nothing in the POSIX specification that says that it
-	   cannot.  So we set multiple_threads to true so that cancellation
-	   points get executed.  */
-	THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
-#ifndef TLS_MULTIPLE_THREADS_IN_TCB
+	   cannot.  So we set __libc_single_threaded to true so that
+	   cancellation points get executed.  */
 	__libc_single_threaded = 0;
-#endif
     }
   while (!atomic_compare_exchange_weak_acquire (&pd->cancelhandling, &oldval,
 						newval));
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 5633d01c62..d43865352f 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -882,11 +882,6 @@  __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 	   other reason that create_thread chose.  Now let it run
 	   free.  */
 	lll_unlock (pd->lock, LLL_PRIVATE);
-
-      /* We now have for sure more than one thread.  The main thread might
-	 not yet have the flag set.  No need to set the global variable
-	 again if this is what we use.  */
-      THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
     }
 
  out:
diff --git a/sysdeps/i386/htl/tcb-offsets.sym b/sysdeps/i386/htl/tcb-offsets.sym
index 7b7c719369..f3f7df6c06 100644
--- a/sysdeps/i386/htl/tcb-offsets.sym
+++ b/sysdeps/i386/htl/tcb-offsets.sym
@@ -2,7 +2,6 @@ 
 #include <tls.h>
 #include <kernel-features.h>
 
-MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads)
 SYSINFO_OFFSET          offsetof (tcbhead_t, sysinfo)
 POINTER_GUARD           offsetof (tcbhead_t, pointer_guard)
 SIGSTATE_OFFSET         offsetof (tcbhead_t, _hurd_sigstate)
diff --git a/sysdeps/i386/nptl/tcb-offsets.sym b/sysdeps/i386/nptl/tcb-offsets.sym
index 2ec9e787c1..1efd1469d8 100644
--- a/sysdeps/i386/nptl/tcb-offsets.sym
+++ b/sysdeps/i386/nptl/tcb-offsets.sym
@@ -6,7 +6,6 @@  RESULT			offsetof (struct pthread, result)
 TID			offsetof (struct pthread, tid)
 CANCELHANDLING		offsetof (struct pthread, cancelhandling)
 CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
-MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
 SYSINFO_OFFSET		offsetof (tcbhead_t, sysinfo)
 CLEANUP			offsetof (struct pthread, cleanup)
 CLEANUP_PREV		offsetof (struct _pthread_cleanup_buffer, __prev)
diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
index 91090bf287..48940a9f44 100644
--- a/sysdeps/i386/nptl/tls.h
+++ b/sysdeps/i386/nptl/tls.h
@@ -36,7 +36,7 @@  typedef struct
 			   thread descriptor used by libpthread.  */
   dtv_t *dtv;
   void *self;		/* Pointer to the thread descriptor.  */
-  int multiple_threads;
+  int unused_multiple_threads;
   uintptr_t sysinfo;
   uintptr_t stack_guard;
   uintptr_t pointer_guard;
@@ -57,8 +57,6 @@  typedef struct
 _Static_assert (offsetof (tcbhead_t, __private_ss) == 0x30,
 		"offset of __private_ss != 0x30");
 
-# define TLS_MULTIPLE_THREADS_IN_TCB 1
-
 #else /* __ASSEMBLER__ */
 # include <tcb-offsets.h>
 #endif
diff --git a/sysdeps/ia64/nptl/tcb-offsets.sym b/sysdeps/ia64/nptl/tcb-offsets.sym
index b01f712be2..ab2cb180f9 100644
--- a/sysdeps/ia64/nptl/tcb-offsets.sym
+++ b/sysdeps/ia64/nptl/tcb-offsets.sym
@@ -2,5 +2,4 @@ 
 #include <tls.h>
 
 TID			offsetof (struct pthread, tid) - TLS_PRE_TCB_SIZE
-MULTIPLE_THREADS_OFFSET offsetof (struct pthread, header.multiple_threads) - TLS_PRE_TCB_SIZE
 SYSINFO_OFFSET		offsetof (tcbhead_t, __private)
diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
index 8ccedb73e6..008e080fc4 100644
--- a/sysdeps/ia64/nptl/tls.h
+++ b/sysdeps/ia64/nptl/tls.h
@@ -36,8 +36,6 @@  typedef struct
 
 register struct pthread *__thread_self __asm__("r13");
 
-# define TLS_MULTIPLE_THREADS_IN_TCB 1
-
 #else /* __ASSEMBLER__ */
 # include <tcb-offsets.h>
 #endif
diff --git a/sysdeps/mach/hurd/i386/tls.h b/sysdeps/mach/hurd/i386/tls.h
index 264ed9a9c5..d33e91c922 100644
--- a/sysdeps/mach/hurd/i386/tls.h
+++ b/sysdeps/mach/hurd/i386/tls.h
@@ -33,7 +33,7 @@  typedef struct
   void *tcb;			/* Points to this structure.  */
   dtv_t *dtv;			/* Vector of pointers to TLS data.  */
   thread_t self;		/* This thread's control port.  */
-  int multiple_threads;
+  int unused_multiple_threads;
   uintptr_t sysinfo;
   uintptr_t stack_guard;
   uintptr_t pointer_guard;
@@ -117,8 +117,6 @@  _hurd_tls_init (tcbhead_t *tcb)
   /* This field is used by TLS accesses to get our "thread pointer"
      from the TLS point of view.  */
   tcb->tcb = tcb;
-  /* We always at least start the sigthread anyway.  */
-  tcb->multiple_threads = 1;
 
   /* Get the first available selector.  */
   int sel = -1;
diff --git a/sysdeps/nios2/nptl/tcb-offsets.sym b/sysdeps/nios2/nptl/tcb-offsets.sym
index 3cd8d984ac..93a695ac7f 100644
--- a/sysdeps/nios2/nptl/tcb-offsets.sym
+++ b/sysdeps/nios2/nptl/tcb-offsets.sym
@@ -8,6 +8,5 @@ 
 # define __thread_self          ((void *) 0)
 # define thread_offsetof(mem)   ((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
 
-MULTIPLE_THREADS_OFFSET		thread_offsetof (header.multiple_threads)
 TID_OFFSET			thread_offsetof (tid)
 POINTER_GUARD			(offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
diff --git a/sysdeps/or1k/nptl/tls.h b/sysdeps/or1k/nptl/tls.h
index c6ffe62c3f..3bb07beef8 100644
--- a/sysdeps/or1k/nptl/tls.h
+++ b/sysdeps/or1k/nptl/tls.h
@@ -35,8 +35,6 @@  typedef struct
 
 register tcbhead_t *__thread_self __asm__("r10");
 
-# define TLS_MULTIPLE_THREADS_IN_TCB 1
-
 /* Get system call information.  */
 # include <sysdep.h>
 
diff --git a/sysdeps/powerpc/nptl/tcb-offsets.sym b/sysdeps/powerpc/nptl/tcb-offsets.sym
index 4c01615ad0..a0ee95f94d 100644
--- a/sysdeps/powerpc/nptl/tcb-offsets.sym
+++ b/sysdeps/powerpc/nptl/tcb-offsets.sym
@@ -10,9 +10,6 @@ 
 # define thread_offsetof(mem)	((ptrdiff_t) THREAD_SELF + offsetof (struct pthread, mem))
 
 
-#if TLS_MULTIPLE_THREADS_IN_TCB
-MULTIPLE_THREADS_OFFSET		thread_offsetof (header.multiple_threads)
-#endif
 TID				thread_offsetof (tid)
 POINTER_GUARD			(offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
 TAR_SAVE			(offsetof (tcbhead_t, tar_save) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
index 22b0075235..fd5ee51981 100644
--- a/sysdeps/powerpc/nptl/tls.h
+++ b/sysdeps/powerpc/nptl/tls.h
@@ -52,9 +52,6 @@ 
 # define TLS_DTV_AT_TP	1
 # define TLS_TCB_AT_TP	0
 
-/* We use the multiple_threads field in the pthread struct */
-#define TLS_MULTIPLE_THREADS_IN_TCB	1
-
 /* Get the thread descriptor definition.  */
 # include <nptl/descr.h>
 
diff --git a/sysdeps/s390/nptl/tcb-offsets.sym b/sysdeps/s390/nptl/tcb-offsets.sym
index 9c1c01f353..bc7b267463 100644
--- a/sysdeps/s390/nptl/tcb-offsets.sym
+++ b/sysdeps/s390/nptl/tcb-offsets.sym
@@ -1,6 +1,5 @@ 
 #include <sysdep.h>
 #include <tls.h>
 
-MULTIPLE_THREADS_OFFSET		offsetof (tcbhead_t, multiple_threads)
 STACK_GUARD			offsetof (tcbhead_t, stack_guard)
 TID				offsetof (struct pthread, tid)
diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
index ff210ffeb2..d69ed539f7 100644
--- a/sysdeps/s390/nptl/tls.h
+++ b/sysdeps/s390/nptl/tls.h
@@ -35,7 +35,7 @@  typedef struct
 			   thread descriptor used by libpthread.  */
   dtv_t *dtv;
   void *self;		/* Pointer to the thread descriptor.  */
-  int multiple_threads;
+  int unused_multiple_threads;
   uintptr_t sysinfo;
   uintptr_t stack_guard;
   int gscope_flag;
@@ -44,10 +44,6 @@  typedef struct
   void *__private_ss;
 } tcbhead_t;
 
-# ifndef __s390x__
-#  define TLS_MULTIPLE_THREADS_IN_TCB 1
-# endif
-
 #else /* __ASSEMBLER__ */
 # include <tcb-offsets.h>
 #endif
diff --git a/sysdeps/sh/nptl/tcb-offsets.sym b/sysdeps/sh/nptl/tcb-offsets.sym
index 234207779d..4e452d9c6c 100644
--- a/sysdeps/sh/nptl/tcb-offsets.sym
+++ b/sysdeps/sh/nptl/tcb-offsets.sym
@@ -6,7 +6,6 @@  RESULT			offsetof (struct pthread, result)
 TID			offsetof (struct pthread, tid)
 CANCELHANDLING		offsetof (struct pthread, cancelhandling)
 CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
-MULTIPLE_THREADS_OFFSET	offsetof (struct pthread, header.multiple_threads)
 TLS_PRE_TCB_SIZE	sizeof (struct pthread)
 MUTEX_FUTEX		offsetof (pthread_mutex_t, __data.__lock)
 POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
index 76591ab6ef..8778cb4ac0 100644
--- a/sysdeps/sh/nptl/tls.h
+++ b/sysdeps/sh/nptl/tls.h
@@ -36,8 +36,6 @@  typedef struct
   uintptr_t pointer_guard;
 } tcbhead_t;
 
-# define TLS_MULTIPLE_THREADS_IN_TCB 1
-
 #else /* __ASSEMBLER__ */
 # include <tcb-offsets.h>
 #endif /* __ASSEMBLER__ */
diff --git a/sysdeps/sparc/nptl/tcb-offsets.sym b/sysdeps/sparc/nptl/tcb-offsets.sym
index f75d02065e..e4a7e4720f 100644
--- a/sysdeps/sparc/nptl/tcb-offsets.sym
+++ b/sysdeps/sparc/nptl/tcb-offsets.sym
@@ -1,6 +1,5 @@ 
 #include <sysdep.h>
 #include <tls.h>
 
-MULTIPLE_THREADS_OFFSET		offsetof (tcbhead_t, multiple_threads)
 POINTER_GUARD			offsetof (tcbhead_t, pointer_guard)
 TID				offsetof (struct pthread, tid)
diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
index d1e2bb4ad1..b78cf0d6b4 100644
--- a/sysdeps/sparc/nptl/tls.h
+++ b/sysdeps/sparc/nptl/tls.h
@@ -35,7 +35,7 @@  typedef struct
 			   thread descriptor used by libpthread.  */
   dtv_t *dtv;
   void *self;
-  int multiple_threads;
+  int unused_multiple_threads;
 #if __WORDSIZE == 64
   int gscope_flag;
 #endif
diff --git a/sysdeps/unix/sysv/linux/single-thread.h b/sysdeps/unix/sysv/linux/single-thread.h
index 208edccce6..dd80e82c82 100644
--- a/sysdeps/unix/sysv/linux/single-thread.h
+++ b/sysdeps/unix/sysv/linux/single-thread.h
@@ -23,20 +23,7 @@ 
 # include <sys/single_threaded.h>
 #endif
 
-/* The default way to check if the process is single thread is by using the
-   pthread_t 'multiple_threads' field.  However, for some architectures it is
-   faster to either use an extra field on TCB or global variables (the TCB
-   field is also used on x86 for some single-thread atomic optimizations).
-
-   The ABI might define SINGLE_THREAD_BY_GLOBAL to enable the single thread
-   check to use global variables instead of the pthread_t field.  */
-
-#if !defined SINGLE_THREAD_BY_GLOBAL || IS_IN (rtld)
-# define SINGLE_THREAD_P \
-  (THREAD_GETMEM (THREAD_SELF, header.multiple_threads) == 0)
-#else
-# define SINGLE_THREAD_P (__libc_single_threaded != 0)
-#endif
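+/* All architectures now use the global __libc_single_threaded flag for the
+   single-thread check.  */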
+#define SINGLE_THREAD_P (__libc_single_threaded != 0)
 
 #define RTLD_SINGLE_THREAD_P SINGLE_THREAD_P
 
diff --git a/sysdeps/x86/atomic-machine.h b/sysdeps/x86/atomic-machine.h
index f24f1c71ed..23e087e7e0 100644
--- a/sysdeps/x86/atomic-machine.h
+++ b/sysdeps/x86/atomic-machine.h
@@ -51,292 +51,145 @@ 
 #define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \
   (! __sync_bool_compare_and_swap (mem, oldval, newval))
 
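+/* Emit a compare-and-exchange of the appropriate width, optionally preceded
+   by a LOCK prefix, and return the value previously stored in *MEM.  */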
+#define __cmpxchg_op(lock, mem, newval, oldval)				      \
+  ({ __typeof (*mem) __ret;						      \
+     if (sizeof (*mem) == 1)						      \
+       __asm __volatile (lock "cmpxchgb %b2, %1"			      \
+			 : "=a" (__ret), "+m" (*mem)			      \
+			 : BR_CONSTRAINT (newval), "0" (oldval)		      \
+			 : "memory");					      \
+     else if (sizeof (*mem) == 2)					      \
+       __asm __volatile (lock "cmpxchgw %w2, %1"			      \
+			 : "=a" (__ret), "+m" (*mem)			      \
+			 : BR_CONSTRAINT (newval), "0" (oldval)		      \
+			 : "memory");					      \
+     else if (sizeof (*mem) == 4)					      \
+       __asm __volatile (lock "cmpxchgl %2, %1"			      \
+			 : "=a" (__ret), "+m" (*mem)			      \
+			 : BR_CONSTRAINT (newval), "0" (oldval)		      \
+			 : "memory");					      \
+     else if (__HAVE_64B_ATOMICS)					      \
+       __asm __volatile (lock "cmpxchgq %q2, %1"			      \
+			 : "=a" (__ret), "+m" (*mem)			      \
+			 : "q" ((int64_t) cast_to_integer (newval)),	      \
+			   "0" ((int64_t) cast_to_integer (oldval))	      \
+			 : "memory");					      \
+     else								      \
+       __atomic_link_error ();						      \
+     __ret; })
 
-#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \
+#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgb %b2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     if (SINGLE_THREAD_P)						      \
+       ret = __cmpxchg_op ("", (mem), (newval), (oldval));		      \
+     else								      \
+       ret = __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));	      \
      ret; })
 
-#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \
+#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgw %w2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
+     if (SINGLE_THREAD_P)						      \
+       ret = __cmpxchg_op ("", (mem), (newval), (oldval));		      \
+     else								      \
+       ret = __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));	      \
      ret; })
 
-#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \
+#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%" SEG_REG ":%P5\n\t"			      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgl %2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : BR_CONSTRAINT (newval), "m" (*mem), "0" (oldval),    \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));       \
+     if (SINGLE_THREAD_P)						      \
+       ret = __cmpxchg_op ("", (mem), (newval), (oldval));		      \
+     else								      \
+       ret = __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));	      \
      ret; })
 
-#ifdef __x86_64__
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
+#define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
   ({ __typeof (*mem) ret;						      \
-     __asm __volatile ("cmpl $0, %%fs:%P5\n\t"				      \
-		       "je 0f\n\t"					      \
-		       "lock\n"						      \
-		       "0:\tcmpxchgq %q2, %1"				      \
-		       : "=a" (ret), "=m" (*mem)			      \
-		       : "q" ((int64_t) cast_to_integer (newval)),	      \
-			 "m" (*mem),					      \
-			 "0" ((int64_t) cast_to_integer (oldval)),	      \
-			 "i" (offsetof (tcbhead_t, multiple_threads)));	      \
-     ret; })
-# define do_exchange_and_add_val_64_acq(pfx, mem, value) 0
-# define do_add_val_64_acq(pfx, mem, value) do { } while (0)
-#else
-/* XXX We do not really need 64-bit compare-and-exchange.  At least
-   not in the moment.  Using it would mean causing portability
-   problems since not many other 32-bit architectures have support for
-   such an operation.  So don't define any code for now.  If it is
-   really going to be used the code below can be used on Intel Pentium
-   and later, but NOT on i486.  */
-# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \
-  ({ __typeof (*mem) ret = *(mem);					      \
-     __atomic_link_error ();						      \
-     ret = (newval);							      \
-     ret = (oldval);							      \
-     ret; })
-
-# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval)	      \
-  ({ __typeof (*mem) ret = *(mem);					      \
-     __atomic_link_error ();						      \
-     ret = (newval);							      \
-     ret = (oldval);							      \
-     ret; })
-
-# define do_exchange_and_add_val_64_acq(pfx, mem, value) \
-  ({ __typeof (value) __addval = (value);				      \
-     __typeof (*mem) __result;						      \
-     __typeof (mem) __memp = (mem);					      \
-     __typeof (*mem) __tmpval;						      \
-     __result = *__memp;						      \
-     do									      \
-       __tmpval = __result;						      \
-     while ((__result = pfx##_compare_and_exchange_val_64_acq		      \
-	     (__memp, __result + __addval, __result)) == __tmpval);	      \
-     __result; })
-
-# define do_add_val_64_acq(pfx, mem, value) \
-  {									      \
-    __typeof (value) __addval = (value);				      \
-    __typeof (mem) __memp = (mem);					      \
-    __typeof (*mem) __oldval = *__memp;					      \
-    __typeof (*mem) __tmpval;						      \
-    do									      \
-      __tmpval = __oldval;						      \
-    while ((__oldval = pfx##_compare_and_exchange_val_64_acq		      \
-	    (__memp, __oldval + __addval, __oldval)) == __tmpval);	      \
-  }
-#endif
-
-
-/* Note that we need no lock prefix.  */
-#define atomic_exchange_acq(mem, newvalue) \
-  ({ __typeof (*mem) result;						      \
-     if (sizeof (*mem) == 1)						      \
-       __asm __volatile ("xchgb %b0, %1"				      \
-			 : "=q" (result), "=m" (*mem)			      \
-			 : "0" (newvalue), "m" (*mem));			      \
-     else if (sizeof (*mem) == 2)					      \
-       __asm __volatile ("xchgw %w0, %1"				      \
-			 : "=r" (result), "=m" (*mem)			      \
-			 : "0" (newvalue), "m" (*mem));			      \
-     else if (sizeof (*mem) == 4)					      \
-       __asm __volatile ("xchgl %0, %1"					      \
-			 : "=r" (result), "=m" (*mem)			      \
-			 : "0" (newvalue), "m" (*mem));			      \
-     else if (__HAVE_64B_ATOMICS)					      \
-       __asm __volatile ("xchgq %q0, %1"				      \
-			 : "=r" (result), "=m" (*mem)			      \
-			 : "0" ((int64_t) cast_to_integer (newvalue)),        \
-			   "m" (*mem));					      \
-     else								      \
-       {								      \
-	 result = 0;							      \
-	 __atomic_link_error ();					      \
-       }								      \
-     result; })
-
-
-#define __arch_exchange_and_add_body(lock, pfx, mem, value) \
-  ({ __typeof (*mem) __result;						      \
-     __typeof (value) __addval = (value);				      \
-     if (sizeof (*mem) == 1)						      \
-       __asm __volatile (lock "xaddb %b0, %1"				      \
-			 : "=q" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
-     else if (sizeof (*mem) == 2)					      \
-       __asm __volatile (lock "xaddw %w0, %1"				      \
-			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
-     else if (sizeof (*mem) == 4)					      \
-       __asm __volatile (lock "xaddl %0, %1"				      \
-			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" (__addval), "m" (*mem),			      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
-     else if (__HAVE_64B_ATOMICS)					      \
-       __asm __volatile (lock "xaddq %q0, %1"				      \
-			 : "=r" (__result), "=m" (*mem)			      \
-			 : "0" ((int64_t) cast_to_integer (__addval)),     \
-			   "m" (*mem),					      \
-			   "i" (offsetof (tcbhead_t, multiple_threads)));     \
+     if (SINGLE_THREAD_P)						      \
+       ret = __cmpxchg_op ("", (mem), (newval), (oldval));		      \
      else								      \
-       __result = do_exchange_and_add_val_64_acq (pfx, (mem), __addval);      \
-     __result; })
-
-#define atomic_exchange_and_add(mem, value) \
-  __sync_fetch_and_add (mem, value)
-
-#define __arch_exchange_and_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P4\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_exchange_and_add(mem, value) \
-  __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c,    \
-				mem, value)
-
-
-#define __arch_add_body(lock, pfx, apfx, mem, value) \
-  do {									      \
-    if (__builtin_constant_p (value) && (value) == 1)			      \
-      pfx##_increment (mem);						      \
-    else if (__builtin_constant_p (value) && (value) == -1)		      \
-      pfx##_decrement (mem);						      \
-    else if (sizeof (*mem) == 1)					      \
-      __asm __volatile (lock "addb %b1, %0"				      \
-			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (value), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 2)					      \
-      __asm __volatile (lock "addw %w1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (value), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 4)					      \
-      __asm __volatile (lock "addl %1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (value), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (__HAVE_64B_ATOMICS)					      \
-      __asm __volatile (lock "addq %q1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" ((int64_t) cast_to_integer (value)),	      \
-			  "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else								      \
-      do_add_val_64_acq (apfx, (mem), (value));				      \
-  } while (0)
-
-# define atomic_add(mem, value) \
-  __arch_add_body (LOCK_PREFIX, atomic, __arch, mem, value)
-
-#define __arch_add_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_add(mem, value) \
-  __arch_add_body (__arch_add_cprefix, atomic, __arch_c, mem, value)
+       ret = __cmpxchg_op (LOCK_PREFIX, (mem), (newval), (oldval));	      \
+     ret; })
 
 
-#define atomic_add_negative(mem, value) \
-  ({ unsigned char __result;						      \
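+/* Emit a two-operand read-modify-write instruction OP (xchg, add, and, or)
+   of the appropriate width on *MEM, optionally preceded by a LOCK prefix.
+   For xchg the previous value of *MEM is returned.  */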
+#define __xchg_op(lock, mem, arg, op)					      \
+  ({ __typeof (*mem) __ret = (arg);					      \
      if (sizeof (*mem) == 1)						      \
-       __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : IBR_CONSTRAINT (value), "m" (*mem));		      \
+       __asm __volatile (lock #op "b %b0, %1"				      \
+			 : "=q" (__ret), "=m" (*mem)			      \
+			 : "0" (arg), "m" (*mem)			      \
+			 : "memory", "cc");				      \
      else if (sizeof (*mem) == 2)					      \
-       __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : "ir" (value), "m" (*mem));			      \
+       __asm __volatile (lock #op "w %w0, %1"				      \
+			 : "=r" (__ret), "=m" (*mem)			      \
+			 : "0" (arg), "m" (*mem)			      \
+			 : "memory", "cc");				      \
      else if (sizeof (*mem) == 4)					      \
-       __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : "ir" (value), "m" (*mem));			      \
+       __asm __volatile (lock #op "l %0, %1"				      \
+			 : "=r" (__ret), "=m" (*mem)			      \
+			 : "0" (arg), "m" (*mem)			      \
+			 : "memory", "cc");				      \
      else if (__HAVE_64B_ATOMICS)					      \
-       __asm __volatile (LOCK_PREFIX "addq %q2, %0; sets %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : "ir" ((int64_t) cast_to_integer (value)),	      \
-			   "m" (*mem));					      \
+       __asm __volatile (lock #op "q %q0, %1"				      \
+			 : "=r" (__ret), "=m" (*mem)			      \
+			 : "0" ((int64_t) cast_to_integer (arg)),	      \
+			   "m" (*mem)					      \
+			 : "memory", "cc");				      \
      else								      \
        __atomic_link_error ();						      \
-     __result; })
-
+     __ret; })
 
-#define atomic_add_zero(mem, value) \
-  ({ unsigned char __result;						      \
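+/* Emit a single-operand read-modify-write instruction OP (inc or dec) of
+   the appropriate width on *MEM, optionally preceded by a LOCK prefix.  */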
+#define __single_op(lock, mem, op)					      \
+  ({									      \
      if (sizeof (*mem) == 1)						      \
-       __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : IBR_CONSTRAINT (value), "m" (*mem));		      \
+       __asm __volatile (lock #op "b %b0"				      \
+			 : "=m" (*mem)					      \
+			 : "m" (*mem)					      \
+			 : "memory", "cc");				      \
      else if (sizeof (*mem) == 2)					      \
-       __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : "ir" (value), "m" (*mem));			      \
+       __asm __volatile (lock #op "w %b0"				      \
+			 : "=m" (*mem)					      \
+			 : "m" (*mem)					      \
+			 : "memory", "cc");				      \
      else if (sizeof (*mem) == 4)					      \
-       __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : "ir" (value), "m" (*mem));			      \
+       __asm __volatile (lock #op "l %b0"				      \
+			 : "=m" (*mem)					      \
+			 : "m" (*mem)					      \
+			 : "memory", "cc");				      \
      else if (__HAVE_64B_ATOMICS)					      \
-       __asm __volatile (LOCK_PREFIX "addq %q2, %0; setz %1"		      \
-			 : "=m" (*mem), "=qm" (__result)		      \
-			 : "ir" ((int64_t) cast_to_integer (value)),	      \
-			   "m" (*mem));					      \
+       __asm __volatile (lock #op "q %b0"				      \
+			 : "=m" (*mem)					      \
+			 : "m" (*mem)					      \
+			 : "memory", "cc");				      \
      else								      \
-       __atomic_link_error ();					      \
-     __result; })
+       __atomic_link_error ();						      \
+  })
 
+/* Note that we need no lock prefix.  */
+#define atomic_exchange_acq(mem, newvalue)				      \
+  __xchg_op ("", (mem), (newvalue), xchg)
 
-#define __arch_increment_body(lock, pfx, mem) \
-  do {									      \
-    if (sizeof (*mem) == 1)						      \
-      __asm __volatile (lock "incb %b0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 2)					      \
-      __asm __volatile (lock "incw %w0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 4)					      \
-      __asm __volatile (lock "incl %0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (__HAVE_64B_ATOMICS)					      \
-      __asm __volatile (lock "incq %q0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else								      \
-      do_add_val_64_acq (pfx, mem, 1);					      \
-  } while (0)
+#define atomic_add(mem, value)						      \
+  __xchg_op (LOCK_PREFIX, (mem), (value), add)
 
-#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem)
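+/* The catomic_* variants skip the LOCK prefix when the process is known to
+   be single-threaded.  */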
+#define catomic_add(mem, value)						      \
+  ({									      \
+    if (SINGLE_THREAD_P)						      \
+      __xchg_op ("", (mem), (value), add);				      \
+    else								      \
+      atomic_add (mem, value);						      \
+  })
 
-#define __arch_increment_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
 
-#define catomic_increment(mem) \
-  __arch_increment_body (__arch_increment_cprefix, __arch_c, mem)
+#define atomic_increment(mem) \
+  __single_op (LOCK_PREFIX, (mem), inc)
 
+#define catomic_increment(mem)						      \
+  ({									      \
+    if (SINGLE_THREAD_P)						      \
+      __single_op ("", (mem), inc);					      \
+    else								      \
+      atomic_increment (mem);						      \
+  })
 
 #define atomic_increment_and_test(mem) \
   ({ unsigned char __result;						      \
@@ -357,43 +210,20 @@ 
 			 : "=m" (*mem), "=qm" (__result)		      \
 			 : "m" (*mem));					      \
      else								      \
-       __atomic_link_error ();					      \
+       __atomic_link_error ();						      \
      __result; })
 
 
-#define __arch_decrement_body(lock, pfx, mem) \
-  do {									      \
-    if (sizeof (*mem) == 1)						      \
-      __asm __volatile (lock "decb %b0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 2)					      \
-      __asm __volatile (lock "decw %w0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 4)					      \
-      __asm __volatile (lock "decl %0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (__HAVE_64B_ATOMICS)					      \
-      __asm __volatile (lock "decq %q0"					      \
-			: "=m" (*mem)					      \
-			: "m" (*mem),					      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else								      \
-      do_add_val_64_acq (pfx, mem, -1);					      \
-  } while (0)
-
-#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem)
+#define atomic_decrement(mem)						      \
+  __single_op (LOCK_PREFIX, (mem), dec)
 
-#define __arch_decrement_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P2\n\tje 0f\n\tlock\n0:\t"
-
-#define catomic_decrement(mem) \
-  __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem)
+#define catomic_decrement(mem)						      \
+  ({									      \
+    if (SINGLE_THREAD_P)						      \
+      __single_op ("", (mem), dec);					      \
+    else								      \
+      atomic_decrement (mem);						      \
+  })
 
 
 #define atomic_decrement_and_test(mem) \
@@ -463,73 +293,31 @@ 
 			 : "=q" (__result), "=m" (*mem)			      \
 			 : "m" (*mem), "ir" (bit));			      \
      else							      	      \
-       __atomic_link_error ();					      \
+       __atomic_link_error ();						      \
      __result; })
 
 
-#define __arch_and_body(lock, mem, mask) \
-  do {									      \
-    if (sizeof (*mem) == 1)						      \
-      __asm __volatile (lock "andb %b1, %0"				      \
-			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 2)					      \
-      __asm __volatile (lock "andw %w1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 4)					      \
-      __asm __volatile (lock "andl %1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (__HAVE_64B_ATOMICS)					      \
-      __asm __volatile (lock "andq %q1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else								      \
-      __atomic_link_error ();						      \
-  } while (0)
-
-#define __arch_cprefix \
-  "cmpl $0, %%" SEG_REG ":%P3\n\tje 0f\n\tlock\n0:\t"
-
-#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask)
-
-#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask)
+#define atomic_and(mem, mask)						      \
+  __xchg_op (LOCK_PREFIX, (mem), (mask), and)
 
+#define catomic_and(mem, mask) \
+  ({									      \
+    if (SINGLE_THREAD_P)						      \
+      __xchg_op ("", (mem), (mask), and);				      \
+    else								      \
+      atomic_and (mem, mask);						      \
+  })
 
-#define __arch_or_body(lock, mem, mask) \
-  do {									      \
-    if (sizeof (*mem) == 1)						      \
-      __asm __volatile (lock "orb %b1, %0"				      \
-			: "=m" (*mem)					      \
-			: IBR_CONSTRAINT (mask), "m" (*mem),		      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 2)					      \
-      __asm __volatile (lock "orw %w1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (sizeof (*mem) == 4)					      \
-      __asm __volatile (lock "orl %1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else if (__HAVE_64B_ATOMICS)					      \
-      __asm __volatile (lock "orq %q1, %0"				      \
-			: "=m" (*mem)					      \
-			: "ir" (mask), "m" (*mem),			      \
-			  "i" (offsetof (tcbhead_t, multiple_threads)));      \
-    else								      \
-      __atomic_link_error ();						      \
-  } while (0)
-
-#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask)
+#define atomic_or(mem, mask)						      \
+  __xchg_op (LOCK_PREFIX, (mem), (mask), or)
 
-#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask)
+#define catomic_or(mem, mask) \
+  ({									      \
+    if (SINGLE_THREAD_P)						      \
+      __xchg_op ("", (mem), (mask), or);				      \
+    else								      \
+      atomic_or (mem, mask);						      \
+  })
 
 /* We don't use mfence because it is supposedly slower due to having to
    provide stronger guarantees (e.g., regarding self-modifying code).  */
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 2bbd563a6c..8ec55a7ea8 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -9,7 +9,6 @@  CLEANUP_JMP_BUF		offsetof (struct pthread, cleanup_jmp_buf)
 CLEANUP			offsetof (struct pthread, cleanup)
 CLEANUP_PREV		offsetof (struct _pthread_cleanup_buffer, __prev)
 MUTEX_FUTEX		offsetof (pthread_mutex_t, __data.__lock)
-MULTIPLE_THREADS_OFFSET	offsetof (tcbhead_t, multiple_threads)
 POINTER_GUARD		offsetof (tcbhead_t, pointer_guard)
 FEATURE_1_OFFSET	offsetof (tcbhead_t, feature_1)
 SSP_BASE_OFFSET		offsetof (tcbhead_t, ssp_base)