diff mbox series

nptl: Move stack list variables into _rtld_global

Message ID 87a6vlthqn.fsf@oldenburg2.str.redhat.com
State New
Headers show
Series nptl: Move stack list variables into _rtld_global | expand

Commit Message

Florian Weimer Nov. 13, 2020, 3:10 p.m. UTC
Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
formerly __wait_lookup_done) can be implemented directly in ld.so,
eliminating the unprotected GL (dl_wait_lookup_done) function
pointer.

---
 csu/libc-tls.c                       |   7 ++
 elf/Makefile                         |   6 +-
 elf/dl-support.c                     |   9 ++-
 elf/dl-thread_gscope_wait.c          |   2 +
 elf/rtld.c                           |  12 +++
 nptl/allocatestack.c                 | 147 +++++++++--------------------------
 nptl/descr.h                         |   3 +-
 nptl/nptl-init.c                     |  11 +--
 nptl/pthreadP.h                      |   4 -
 nptl/pthread_create.c                |   8 +-
 nptl_db/structs.def                  |   4 +-
 nptl_db/td_init.c                    |  14 ++++
 nptl_db/td_ta_map_lwp2thr.c          |   2 +-
 nptl_db/td_ta_thr_iter.c             |   4 +-
 nptl_db/td_thr_tlsbase.c             |   7 +-
 nptl_db/td_thr_validate.c            |  39 +++++++++-
 nptl_db/thread_dbP.h                 |  13 ++++
 sysdeps/aarch64/nptl/tls.h           |   2 -
 sysdeps/alpha/nptl/tls.h             |   2 -
 sysdeps/arc/nptl/tls.h               |   2 -
 sysdeps/arm/nptl/tls.h               |   2 -
 sysdeps/csky/nptl/tls.h              |   2 -
 sysdeps/generic/ldsodefs.h           |  19 ++++-
 sysdeps/hppa/nptl/tls.h              |   2 -
 sysdeps/i386/nptl/tls.h              |   2 -
 sysdeps/ia64/nptl/tls.h              |   2 -
 sysdeps/m68k/nptl/tls.h              |   2 -
 sysdeps/microblaze/nptl/tls.h        |   2 -
 sysdeps/mips/nptl/tls.h              |   2 -
 sysdeps/nios2/nptl/tls.h             |   2 -
 sysdeps/nptl/dl-thread_gscope_wait.c |  80 +++++++++++++++++++
 sysdeps/powerpc/nptl/tls.h           |   2 -
 sysdeps/riscv/nptl/tls.h             |   2 -
 sysdeps/s390/nptl/tls.h              |   2 -
 sysdeps/sh/nptl/tls.h                |   2 -
 sysdeps/sparc/nptl/tls.h             |   2 -
 sysdeps/x86_64/nptl/tls.h            |   2 -
 37 files changed, 243 insertions(+), 184 deletions(-)

Comments

Adhemerval Zanella Nov. 16, 2020, 6:02 p.m. UTC | #1
On 13/11/2020 12:10, Florian Weimer via Libc-alpha wrote:
> Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
> formerly __wait_lookup_done) can be implemented directly in ld.so,
> eliminating the unprotected GL (dl_wait_lookup_done) function
> pointer.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> 
> ---
>  csu/libc-tls.c                       |   7 ++
>  elf/Makefile                         |   6 +-
>  elf/dl-support.c                     |   9 ++-
>  elf/dl-thread_gscope_wait.c          |   2 +
>  elf/rtld.c                           |  12 +++
>  nptl/allocatestack.c                 | 147 +++++++++--------------------------
>  nptl/descr.h                         |   3 +-
>  nptl/nptl-init.c                     |  11 +--
>  nptl/pthreadP.h                      |   4 -
>  nptl/pthread_create.c                |   8 +-
>  nptl_db/structs.def                  |   4 +-
>  nptl_db/td_init.c                    |  14 ++++
>  nptl_db/td_ta_map_lwp2thr.c          |   2 +-
>  nptl_db/td_ta_thr_iter.c             |   4 +-
>  nptl_db/td_thr_tlsbase.c             |   7 +-
>  nptl_db/td_thr_validate.c            |  39 +++++++++-
>  nptl_db/thread_dbP.h                 |  13 ++++
>  sysdeps/aarch64/nptl/tls.h           |   2 -
>  sysdeps/alpha/nptl/tls.h             |   2 -
>  sysdeps/arc/nptl/tls.h               |   2 -
>  sysdeps/arm/nptl/tls.h               |   2 -
>  sysdeps/csky/nptl/tls.h              |   2 -
>  sysdeps/generic/ldsodefs.h           |  19 ++++-
>  sysdeps/hppa/nptl/tls.h              |   2 -
>  sysdeps/i386/nptl/tls.h              |   2 -
>  sysdeps/ia64/nptl/tls.h              |   2 -
>  sysdeps/m68k/nptl/tls.h              |   2 -
>  sysdeps/microblaze/nptl/tls.h        |   2 -
>  sysdeps/mips/nptl/tls.h              |   2 -
>  sysdeps/nios2/nptl/tls.h             |   2 -
>  sysdeps/nptl/dl-thread_gscope_wait.c |  80 +++++++++++++++++++
>  sysdeps/powerpc/nptl/tls.h           |   2 -
>  sysdeps/riscv/nptl/tls.h             |   2 -
>  sysdeps/s390/nptl/tls.h              |   2 -
>  sysdeps/sh/nptl/tls.h                |   2 -
>  sysdeps/sparc/nptl/tls.h             |   2 -
>  sysdeps/x86_64/nptl/tls.h            |   2 -
>  37 files changed, 243 insertions(+), 184 deletions(-)
> 
> diff --git a/csu/libc-tls.c b/csu/libc-tls.c
> index 06e76bd395..c3589f0a7d 100644
> --- a/csu/libc-tls.c
> +++ b/csu/libc-tls.c
> @@ -24,6 +24,7 @@
>  #include <stdio.h>
>  #include <sys/param.h>
>  #include <array_length.h>
> +#include <list.h>
>  
>  #ifdef SHARED
>   #error makefile bug, this file is for static only
> @@ -193,6 +194,12 @@ __libc_setup_tls (void)
>    if (__builtin_expect (lossage != NULL, 0))
>      _startup_fatal (lossage);
>  
> +#if THREAD_GSCOPE_IN_TCB
> +  INIT_LIST_HEAD (&_dl_stack_used);
> +  INIT_LIST_HEAD (&_dl_stack_user);
> +  list_add (&THREAD_SELF->list, &_dl_stack_user);
> +#endif
> +
>    /* Update the executable's link map with enough information to make
>       the TLS routines happy.  */
>    main_map->l_tls_align = align;

Ok.

> diff --git a/elf/Makefile b/elf/Makefile
> index f10cc59e7c..2015383eb2 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -34,7 +34,8 @@ dl-routines	= $(addprefix dl-,load lookup object reloc deps \
>  				  version profile tls origin scope \
>  				  execstack open close trampoline \
>  				  exception sort-maps lookup-direct \
> -				  call-libc-early-init write)
> +				  call-libc-early-init write \
> +				  thread_gscope_wait)
>  ifeq (yes,$(use-ldconfig))
>  dl-routines += dl-cache
>  endif

Ok.

> @@ -54,7 +55,8 @@ endif
>  all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
>  # But they are absent from the shared libc, because that code is in ld.so.
>  elide-routines.os = $(all-dl-routines) dl-support enbl-secure dl-origin \
> -		    dl-sysdep dl-exception dl-reloc-static-pie
> +		    dl-sysdep dl-exception dl-reloc-static-pie \
> +		    thread_gscope_wait
>  
>  # ld.so uses those routines, plus some special stuff for being the program
>  # interpreter and operating independent of libc.

Ok.

> diff --git a/elf/dl-support.c b/elf/dl-support.c
> index afbc94df54..15e6d787c9 100644
> --- a/elf/dl-support.c
> +++ b/elf/dl-support.c
> @@ -183,10 +183,11 @@ ElfW(Word) _dl_stack_flags = DEFAULT_STACK_PERMS;
>  int (*_dl_make_stack_executable_hook) (void **) = _dl_make_stack_executable;
>  
>  
> -/* Function in libpthread to wait for termination of lookups.  */
> -void (*_dl_wait_lookup_done) (void);
> -
> -#if !THREAD_GSCOPE_IN_TCB
> +#if THREAD_GSCOPE_IN_TCB
> +list_t _dl_stack_used;
> +list_t _dl_stack_user;
> +int _dl_stack_cache_lock;
> +#else
>  int _dl_thread_gscope_count;
>  #endif
>  struct dl_scope_free_list *_dl_scope_free_list;

Ok.

> diff --git a/elf/dl-thread_gscope_wait.c b/elf/dl-thread_gscope_wait.c
> new file mode 100644
> index 0000000000..28e1494730
> --- /dev/null
> +++ b/elf/dl-thread_gscope_wait.c
> @@ -0,0 +1,2 @@
> +/* By default, the dynamic linker does not use an out-of-line
> +   __thread_gscope_wait function.  */

Ok.

> diff --git a/elf/rtld.c b/elf/rtld.c
> index 5d117d0d2c..c4ffc8d4b7 100644
> --- a/elf/rtld.c
> +++ b/elf/rtld.c
> @@ -48,6 +48,7 @@
>  #include <array_length.h>
>  #include <libc-early-init.h>
>  #include <dl-main.h>
> +#include <list.h>
>  
>  #include <assert.h>
>  
> @@ -799,6 +800,9 @@ cannot allocate TLS data structures for initial thread\n");
>    const char *lossage = TLS_INIT_TP (tcbp);
>    if (__glibc_unlikely (lossage != NULL))
>      _dl_fatal_printf ("cannot set up thread-local storage: %s\n", lossage);
> +#if THREAD_GSCOPE_IN_TCB
> +  list_add (&THREAD_SELF->list, &GL (dl_stack_user));
> +#endif
>    tls_init_tp_called = true;
>  
>    return tcbp;

Ok.

> @@ -1139,6 +1143,11 @@ dl_main (const ElfW(Phdr) *phdr,
>    GL(dl_rtld_unlock_recursive) = rtld_lock_default_unlock_recursive;
>  #endif
>  
> +#if THREAD_GSCOPE_IN_TCB
> +  INIT_LIST_HEAD (&GL (dl_stack_used));
> +  INIT_LIST_HEAD (&GL (dl_stack_user));
> +#endif
> +
>    /* The explicit initialization here is cheaper than processing the reloc
>       in the _rtld_local definition's initializer.  */
>    GL(dl_make_stack_executable_hook) = &_dl_make_stack_executable;

Ok.

> @@ -2383,6 +2392,9 @@ dl_main (const ElfW(Phdr) *phdr,
>        if (__glibc_unlikely (lossage != NULL))
>  	_dl_fatal_printf ("cannot set up thread-local storage: %s\n",
>  			  lossage);
> +#if THREAD_GSCOPE_IN_TCB
> +      list_add (&THREAD_SELF->list, &GL (dl_stack_user));
> +#endif
>      }
>  
>    /* Make sure no new search directories have been added.  */

Ok.

> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 4b45f8c884..b7f9eeebf6 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -106,26 +106,14 @@
>  static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40MiBi by default.  */
>  static size_t stack_cache_actsize;
>  
> -/* Mutex protecting this variable.  */
> -static int stack_cache_lock = LLL_LOCK_INITIALIZER;
> -
>  /* List of queued stack frames.  */
>  static LIST_HEAD (stack_cache);
>  
> -/* List of the stacks in use.  */
> -static LIST_HEAD (stack_used);
> -
>  /* We need to record what list operations we are going to do so that,
>     in case of an asynchronous interruption due to a fork() call, we
>     can correct for the work.  */
>  static uintptr_t in_flight_stack;
>  
> -/* List of the threads with user provided stacks in use.  No need to
> -   initialize this, since it's done in __pthread_initialize_minimal.  */
> -list_t __stack_user __attribute__ ((nocommon));
> -hidden_data_def (__stack_user)
> -
> -
>  /* Check whether the stack is still used or not.  */
>  #define FREE_P(descr) ((descr)->tid <= 0)
>  

Ok.

> @@ -173,7 +161,7 @@ get_cached_stack (size_t *sizep, void **memp)
>    struct pthread *result = NULL;
>    list_t *entry;
>  
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Search the cache for a matching entry.  We search for the
>       smallest stack which has at least the required size.  Note that

Ok.

> @@ -206,7 +194,7 @@ get_cached_stack (size_t *sizep, void **memp)
>        || __builtin_expect (result->stackblock_size > 4 * size, 0))
>      {
>        /* Release the lock.  */
> -      lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>        return NULL;
>      }

Ok.

> @@ -218,13 +206,13 @@ get_cached_stack (size_t *sizep, void **memp)
>    stack_list_del (&result->list);
>  
>    /* And add to the list of stacks in use.  */
> -  stack_list_add (&result->list, &stack_used);
> +  stack_list_add (&result->list, &GL (dl_stack_used));
>  
>    /* And decrease the cache size.  */
>    stack_cache_actsize -= result->stackblock_size;
>  
>    /* Release the lock early.  */
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Report size and location of the stack to the caller.  */
>    *sizep = result->stackblock_size;

Ok.

> @@ -510,12 +498,12 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  
>  
>        /* Prepare to modify global data.  */
> -      lll_lock (stack_cache_lock, LLL_PRIVATE);
> +      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>        /* And add to the list of stacks in use.  */
> -      list_add (&pd->list, &__stack_user);
> +      list_add (&pd->list, &GL (dl_stack_user));
>  
> -      lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>      }
>    else
>      {

Ok.

> @@ -644,12 +632,12 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  
>  
>  	  /* Prepare to modify global data.  */
> -	  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +	  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  	  /* And add to the list of stacks in use.  */
> -	  stack_list_add (&pd->list, &stack_used);
> +	  stack_list_add (&pd->list, &GL (dl_stack_used));
>  
> -	  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +	  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  
>  	  /* There might have been a race.  Another thread might have

Ok.

> @@ -690,12 +678,12 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  	  if (__mprotect (guard, guardsize, PROT_NONE) != 0)
>  	    {
>  	    mprot_error:
> -	      lll_lock (stack_cache_lock, LLL_PRIVATE);
> +	      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  	      /* Remove the thread from the list.  */
>  	      stack_list_del (&pd->list);
>  
> -	      lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +	      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  	      /* Get rid of the TLS block we allocated.  */
>  	      _dl_deallocate_tls (TLS_TPADJ (pd), false);

Ok.

> @@ -799,7 +787,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  void
>  __deallocate_stack (struct pthread *pd)
>  {
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Remove the thread from the list of threads with user defined
>       stacks.  */
> @@ -815,7 +803,7 @@ __deallocate_stack (struct pthread *pd)
>      /* Free the memory associated with the ELF TLS.  */
>      _dl_deallocate_tls (TLS_TPADJ (pd), false);
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  }
>  
>  

Ok.

> @@ -831,10 +819,10 @@ __make_stacks_executable (void **stack_endp)
>    const size_t pagemask = ~(__getpagesize () - 1);
>  #endif
>  
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        err = change_stack_perm (list_entry (runp, struct pthread, list)
>  #ifdef NEED_SEPARATE_REGISTER_STACK

Ok.

> @@ -860,7 +848,7 @@ __make_stacks_executable (void **stack_endp)
>  	  break;
>        }
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    return err;
>  }
> @@ -891,8 +879,8 @@ __reclaim_stacks (void)
>  	     pointers at the head of the list are inconsistent.  */
>  	  list_t *l = NULL;
>  
> -	  if (stack_used.next->prev != &stack_used)
> -	    l = &stack_used;
> +	  if (GL (dl_stack_used).next->prev != &GL (dl_stack_used))
> +	    l = &GL (dl_stack_used);
>  	  else if (stack_cache.next->prev != &stack_cache)
>  	    l = &stack_cache;
>  

Ok.

> @@ -914,7 +902,7 @@ __reclaim_stacks (void)
>  
>    /* Mark all stacks except the still running one as free.  */
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        struct pthread *curp = list_entry (runp, struct pthread, list);
>        if (curp != self)

Ok.

> @@ -948,7 +936,7 @@ __reclaim_stacks (void)
>      }
>  
>    /* Add the stack of all running threads to the cache.  */
> -  list_splice (&stack_used, &stack_cache);
> +  list_splice (&GL (dl_stack_used), &stack_cache);
>  
>    /* Remove the entry for the current thread to from the cache list
>       and add it to the list of running threads.  Which of the two

Ok.

> @@ -956,13 +944,13 @@ __reclaim_stacks (void)
>    stack_list_del (&self->list);
>  
>    /* Re-initialize the lists for all the threads.  */
> -  INIT_LIST_HEAD (&stack_used);
> -  INIT_LIST_HEAD (&__stack_user);
> +  INIT_LIST_HEAD (&GL (dl_stack_used));
> +  INIT_LIST_HEAD (&GL (dl_stack_user));
>  
>    if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
> -    list_add (&self->list, &__stack_user);
> +    list_add (&self->list, &GL (dl_stack_user));
>    else
> -    list_add (&self->list, &stack_used);
> +    list_add (&self->list, &GL (dl_stack_used));
>  
>    /* There is one thread running.  */
>    __nptl_nthreads = 1;

Ok.

> @@ -970,7 +958,7 @@ __reclaim_stacks (void)
>    in_flight_stack = 0;
>  
>    /* Initialize locks.  */
> -  stack_cache_lock = LLL_LOCK_INITIALIZER;
> +  GL (dl_stack_cache_lock) = LLL_LOCK_INITIALIZER;
>    __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
>  }
>  

Ok.

> @@ -1083,7 +1071,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  {
>    int signalled;
>    int result;
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    __xidcmd = cmdp;
>    cmdp->cntr = 0;
> @@ -1093,7 +1081,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  
>    /* Iterate over the list with system-allocated threads first.  */
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1103,7 +1091,7 @@ __nptl_setxid (struct xid_command *cmdp)
>      }
>  
>    /* Now the list with threads using user-allocated stacks.  */
> -  list_for_each (runp, &__stack_user)
> +  list_for_each (runp, &GL (dl_stack_user))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1119,7 +1107,7 @@ __nptl_setxid (struct xid_command *cmdp)
>      {
>        signalled = 0;
>  
> -      list_for_each (runp, &stack_used)
> +      list_for_each (runp, &GL (dl_stack_used))
>  	{
>  	  struct pthread *t = list_entry (runp, struct pthread, list);
>  	  if (t == self)
> @@ -1128,7 +1116,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  	  signalled += setxid_signal_thread (cmdp, t);
>  	}
>  
> -      list_for_each (runp, &__stack_user)
> +      list_for_each (runp, &GL (dl_stack_user))
>  	{
>  	  struct pthread *t = list_entry (runp, struct pthread, list);
>  	  if (t == self)
> @@ -1149,7 +1137,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  
>    /* Clean up flags, so that no thread blocks during exit waiting
>       for a signal which will never come.  */
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1158,7 +1146,7 @@ __nptl_setxid (struct xid_command *cmdp)
>        setxid_unmark_thread (cmdp, t);
>      }
>  
> -  list_for_each (runp, &__stack_user)
> +  list_for_each (runp, &GL (dl_stack_user))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1180,7 +1168,7 @@ __nptl_setxid (struct xid_command *cmdp)
>      }
>    __nptl_setxid_error (cmdp, error);
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>    return result;
>  }
>  

Ok.

> @@ -1204,75 +1192,16 @@ void
>  attribute_hidden
>  __pthread_init_static_tls (struct link_map *map)
>  {
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Iterate over the list with system-allocated threads first.  */
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      init_one_static_tls (list_entry (runp, struct pthread, list), map);
>  
>    /* Now the list with threads using user-allocated stacks.  */
> -  list_for_each (runp, &__stack_user)
> +  list_for_each (runp, &GL (dl_stack_user))
>      init_one_static_tls (list_entry (runp, struct pthread, list), map);
>  

Ok.

> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> -}
> -
> -
> -void
> -attribute_hidden
> -__wait_lookup_done (void)
> -{
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> -
> -  struct pthread *self = THREAD_SELF;
> -
> -  /* Iterate over the list with system-allocated threads first.  */
> -  list_t *runp;
> -  list_for_each (runp, &stack_used)
> -    {
> -      struct pthread *t = list_entry (runp, struct pthread, list);
> -      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> -	continue;
> -
> -      int *const gscope_flagp = &t->header.gscope_flag;
> -
> -      /* We have to wait until this thread is done with the global
> -	 scope.  First tell the thread that we are waiting and
> -	 possibly have to be woken.  */
> -      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> -						THREAD_GSCOPE_FLAG_WAIT,
> -						THREAD_GSCOPE_FLAG_USED))
> -	continue;
> -
> -      do
> -	futex_wait_simple ((unsigned int *) gscope_flagp,
> -			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> -      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> -    }
> -
> -  /* Now the list with threads using user-allocated stacks.  */
> -  list_for_each (runp, &__stack_user)
> -    {
> -      struct pthread *t = list_entry (runp, struct pthread, list);
> -      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> -	continue;
> -
> -      int *const gscope_flagp = &t->header.gscope_flag;
> -
> -      /* We have to wait until this thread is done with the global
> -	 scope.  First tell the thread that we are waiting and
> -	 possibly have to be woken.  */
> -      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> -						THREAD_GSCOPE_FLAG_WAIT,
> -						THREAD_GSCOPE_FLAG_USED))
> -	continue;
> -
> -      do
> -	futex_wait_simple ((unsigned int *) gscope_flagp,
> -			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> -      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> -    }
> -
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  }

Ok.

> diff --git a/nptl/descr.h b/nptl/descr.h
> index d8343ff9a1..b172ee408b 100644
> --- a/nptl/descr.h
> +++ b/nptl/descr.h
> @@ -162,7 +162,8 @@ struct pthread
>      void *__padding[24];
>    };
>  
> -  /* This descriptor's link on the `stack_used' or `__stack_user' list.  */
> +  /* This descriptor's link on the GL (dl_stack_used) or
> +     GL (dl_stack_user) list.  */
>    list_t list;
>  
>    /* Thread ID - which is also a 'is this thread descriptor (and

Ok.

> diff --git a/nptl/nptl-init.c b/nptl/nptl-init.c
> index 4aa1231bec..53b817715d 100644
> --- a/nptl/nptl-init.c
> +++ b/nptl/nptl-init.c
> @@ -251,12 +251,9 @@ __pthread_initialize_minimal_internal (void)
>       purposes this is good enough.  */
>    THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end);
>  
> -  /* Initialize the list of all running threads with the main thread.  */
> -  INIT_LIST_HEAD (&__stack_user);
> -  list_add (&pd->list, &__stack_user);
> -
> -  /* Before initializing __stack_user, the debugger could not find us and
> -     had to set __nptl_initial_report_events.  Propagate its setting.  */
> +  /* Before initializing GL (dl_stack_user), the debugger could not
> +     find us and had to set __nptl_initial_report_events.  Propagate
> +     its setting.  */
>    THREAD_SETMEM (pd, report_events, __nptl_initial_report_events);
>  
>    struct sigaction sa;

Ok.

> @@ -336,8 +333,6 @@ __pthread_initialize_minimal_internal (void)
>  
>    GL(dl_init_static_tls) = &__pthread_init_static_tls;
>  
> -  GL(dl_wait_lookup_done) = &__wait_lookup_done;
> -
>    /* Register the fork generation counter with the libc.  */

Ok.

>  #ifndef TLS_MULTIPLE_THREADS_IN_TCB
>    __libc_multiple_threads_ptr =
> diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
> index 686f54aa93..a7510f9f63 100644
> --- a/nptl/pthreadP.h
> +++ b/nptl/pthreadP.h
> @@ -208,10 +208,6 @@ extern void __default_pthread_attr_freeres (void) attribute_hidden;
>  extern size_t __static_tls_size attribute_hidden;
>  extern size_t __static_tls_align_m1 attribute_hidden;
>  
> -/* Thread descriptor handling.  */
> -extern list_t __stack_user;
> -hidden_proto (__stack_user)
> -
>  /* Attribute handling.  */
>  extern struct pthread_attr *__attr_list attribute_hidden;
>  extern int __attr_list_lock attribute_hidden;

Ok.

> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 447f005ece..bad4e57a84 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -213,9 +213,9 @@ __find_in_stack_list (struct pthread *pd)
>    list_t *entry;
>    struct pthread *result = NULL;
>  
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
> -  list_for_each (entry, &stack_used)
> +  list_for_each (entry, &GL (dl_stack_used))
>      {
>        struct pthread *curp;
>  
> @@ -228,7 +228,7 @@ __find_in_stack_list (struct pthread *pd)
>      }
>  
>    if (result == NULL)
> -    list_for_each (entry, &__stack_user)
> +    list_for_each (entry, &GL (dl_stack_user))
>        {
>  	struct pthread *curp;
>  
> @@ -240,7 +240,7 @@ __find_in_stack_list (struct pthread *pd)
>  	  }
>        }
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    return result;
>  }

Ok.

> diff --git a/nptl_db/structs.def b/nptl_db/structs.def
> index 472e9008fa..2ca33598b1 100644
> --- a/nptl_db/structs.def
> +++ b/nptl_db/structs.def
> @@ -69,8 +69,6 @@ DB_STRUCT (td_eventbuf_t)
>  DB_STRUCT_FIELD (td_eventbuf_t, eventnum)
>  DB_STRUCT_FIELD (td_eventbuf_t, eventdata)
>  
> -DB_SYMBOL (stack_used)
> -DB_SYMBOL (__stack_user)
>  DB_SYMBOL (nptl_version)
>  DB_FUNCTION (__nptl_create_event)
>  DB_FUNCTION (__nptl_death_event)
> @@ -106,6 +104,8 @@ DB_STRUCT (rtld_global)
>  DB_RTLD_VARIABLE (_rtld_global)
>  #endif
>  DB_RTLD_GLOBAL_FIELD (dl_tls_dtv_slotinfo_list)
> +DB_RTLD_GLOBAL_FIELD (dl_stack_user)
> +DB_RTLD_GLOBAL_FIELD (dl_stack_used)
>  
>  DB_STRUCT (dtv_slotinfo_list)
>  DB_STRUCT_FIELD (dtv_slotinfo_list, len)

Ok.

> diff --git a/nptl_db/td_init.c b/nptl_db/td_init.c
> index 86773dcb06..79a1f90312 100644
> --- a/nptl_db/td_init.c
> +++ b/nptl_db/td_init.c
> @@ -29,3 +29,17 @@ td_init (void)
>    LOG ("td_init");
>    return TD_OK;
>  }
> +
> +bool
> +__td_ta_rtld_global (td_thragent_t *ta)
> +{
> +  if (ta->ta_addr__rtld_global == 0
> +      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
> +                        &ta->ta_addr__rtld_global) != PS_OK)
> +    {
> +      ta->ta_addr__rtld_global = (void*)-1;
> +      return false;
> +    }
> +  else
> +    return ta->ta_addr__rtld_global != (void*)-1;
> +}

Ok.

> diff --git a/nptl_db/td_ta_map_lwp2thr.c b/nptl_db/td_ta_map_lwp2thr.c
> index d04d4bec23..81ff48808c 100644
> --- a/nptl_db/td_ta_map_lwp2thr.c
> +++ b/nptl_db/td_ta_map_lwp2thr.c
> @@ -187,7 +187,7 @@ td_ta_map_lwp2thr (const td_thragent_t *ta_arg,
>       fake a special descriptor for the initial thread.  */
>  
>    psaddr_t list;
> -  td_err_e err = DB_GET_SYMBOL (list, ta, __stack_user);
> +  td_err_e err = __td_ta_stack_user (ta, &list);
>    if (err != TD_OK)
>      return err;
>  

Ok.

> diff --git a/nptl_db/td_ta_thr_iter.c b/nptl_db/td_ta_thr_iter.c
> index d59782bd6e..e406bcbd39 100644
> --- a/nptl_db/td_ta_thr_iter.c
> +++ b/nptl_db/td_ta_thr_iter.c
> @@ -133,14 +133,14 @@ td_ta_thr_iter (const td_thragent_t *ta_arg, td_thr_iter_f *callback,
>       have to iterate over both lists separately.  We start with the
>       list of threads with user-defined stacks.  */
>  
> -  err = DB_GET_SYMBOL (list, ta, __stack_user);
> +  err = __td_ta_stack_user (ta, &list);
>    if (err == TD_OK)
>      err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
>  			       list, true);
>  
>    /* And the threads with stacks allocated by the implementation.  */
>    if (err == TD_OK)
> -    err = DB_GET_SYMBOL (list, ta, stack_used);
> +    err = __td_ta_stack_used (ta, &list);
>    if (err == TD_OK)
>      err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
>  			       list, false);

Ok.

> diff --git a/nptl_db/td_thr_tlsbase.c b/nptl_db/td_thr_tlsbase.c
> index 8198934112..eeba0a04d8 100644
> --- a/nptl_db/td_thr_tlsbase.c
> +++ b/nptl_db/td_thr_tlsbase.c
> @@ -28,12 +28,7 @@ dtv_slotinfo_list (td_thragent_t *ta,
>    td_err_e err;
>    psaddr_t head;
>  
> -  if (ta->ta_addr__rtld_global == 0
> -      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
> -			&ta->ta_addr__rtld_global) != PS_OK)
> -    ta->ta_addr__rtld_global = (void*)-1;
> -
> -  if (ta->ta_addr__rtld_global != (void*)-1)
> +  if (__td_ta_rtld_global (ta))
>      {
>        err = DB_GET_FIELD (head, ta, ta->ta_addr__rtld_global,
>  			  rtld_global, _dl_tls_dtv_slotinfo_list, 0);
> diff --git a/nptl_db/td_thr_validate.c b/nptl_db/td_thr_validate.c

Ok.

> index c709c5e9be..d2f4107cf2 100644
> --- a/nptl_db/td_thr_validate.c
> +++ b/nptl_db/td_thr_validate.c
> @@ -20,6 +20,41 @@
>  #include "thread_dbP.h"
>  #include <stdbool.h>
>  
> +td_err_e
> +__td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
> +{
> +  if (__td_ta_rtld_global (ta))
> +    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
> +				 rtld_global, _dl_stack_user, 0);
> +  else
> +    {
> +      if (ta->ta_addr__dl_stack_user == 0
> +	  && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_user,
> +			    &ta->ta_addr__dl_stack_user) != PS_OK)
> +	return TD_ERR;
> +      *plist = ta->ta_addr__dl_stack_user;
> +      return TD_OK;
> +    }
> +}
> +
> +td_err_e
> +__td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
> +{
> +
> +  if (__td_ta_rtld_global (ta))
> +    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
> +				 rtld_global, _dl_stack_used, 0);
> +  else
> +    {
> +      if (ta->ta_addr__dl_stack_used == 0
> +	  && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_used,
> +			    &ta->ta_addr__dl_stack_used) != PS_OK)
> +	return TD_ERR;
> +      *plist = ta->ta_addr__dl_stack_used;
> +      return TD_OK;
> +    }
> +}
> +
>  static td_err_e
>  check_thread_list (const td_thrhandle_t *th, psaddr_t head, bool *uninit)
>  {

Ok.

> @@ -62,7 +97,7 @@ td_thr_validate (const td_thrhandle_t *th)
>  
>    /* First check the list with threads using user allocated stacks.  */
>    bool uninit = false;
> -  err = DB_GET_SYMBOL (list, th->th_ta_p, __stack_user);
> +  err = __td_ta_stack_user (th->th_ta_p, &list);
>    if (err == TD_OK)
>      err = check_thread_list (th, list, &uninit);
>  
> @@ -70,7 +105,7 @@ td_thr_validate (const td_thrhandle_t *th)
>       using implementation allocated stacks.  */
>    if (err == TD_NOTHR)
>      {
> -      err = DB_GET_SYMBOL (list, th->th_ta_p, stack_used);
> +      err = __td_ta_stack_used (th->th_ta_p, &list);
>        if (err == TD_OK)
>  	err = check_thread_list (th, list, &uninit);
>  

Ok.

> diff --git a/nptl_db/thread_dbP.h b/nptl_db/thread_dbP.h
> index 8a666b8d4f..9574e68d70 100644
> --- a/nptl_db/thread_dbP.h
> +++ b/nptl_db/thread_dbP.h
> @@ -269,4 +269,17 @@ extern td_err_e _td_check_sizeof (td_thragent_t *ta, uint32_t *sizep,
>  extern td_err_e __td_ta_lookup_th_unique (const td_thragent_t *ta,
>  					  lwpid_t lwpid, td_thrhandle_t *th);
>  
> +/* Try to initialize TA->ta_addr__rtld_global.  Return true on
> +   success, false on failure (which may be cached).  */
> +bool __td_ta_rtld_global (td_thragent_t *ta) attribute_hidden;
> +
> +/* Obtain the address of the list_t fields _dl_stack_user and
> +   _dl_stack_used in _rtld_global, or fall back to the global
> +   variables of the same name (to support statically linked
> +   programs).  */
> +td_err_e __td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
> +  attribute_hidden;
> +td_err_e __td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
> +  attribute_hidden;
> +
>  #endif /* thread_dbP.h */

Ok.

> diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h
> index e5627f777e..dbef4704f1 100644
> --- a/sysdeps/aarch64/nptl/tls.h
> +++ b/sysdeps/aarch64/nptl/tls.h
> @@ -129,8 +129,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  # endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/alpha/nptl/tls.h b/sysdeps/alpha/nptl/tls.h
> index 82549607fd..f3101f57c9 100644
> --- a/sysdeps/alpha/nptl/tls.h
> +++ b/sysdeps/alpha/nptl/tls.h
> @@ -123,8 +123,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>

Ok.

> diff --git a/sysdeps/arc/nptl/tls.h b/sysdeps/arc/nptl/tls.h
> index 184b550ab5..cc5abb1931 100644
> --- a/sysdeps/arc/nptl/tls.h
> +++ b/sysdeps/arc/nptl/tls.h
> @@ -131,8 +131,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* !__ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/arm/nptl/tls.h b/sysdeps/arm/nptl/tls.h
> index 24a488ff37..91dd7498e4 100644
> --- a/sysdeps/arm/nptl/tls.h
> +++ b/sysdeps/arm/nptl/tls.h
> @@ -120,8 +120,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  


Ok.

> diff --git a/sysdeps/csky/nptl/tls.h b/sysdeps/csky/nptl/tls.h
> index bcca9674a1..f308773d40 100644
> --- a/sysdeps/csky/nptl/tls.h
> +++ b/sysdeps/csky/nptl/tls.h
> @@ -147,8 +147,6 @@ typedef struct
>        atomic_write_barrier ();						      \
>      }									      \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  


Ok.

> diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
> index 382eeb9be0..b1da03cafe 100644
> --- a/sysdeps/generic/ldsodefs.h
> +++ b/sysdeps/generic/ldsodefs.h
> @@ -38,6 +38,7 @@
>  #include <libc-lock.h>
>  #include <hp-timing.h>
>  #include <tls.h>
> +#include <list_t.h>
>  
>  __BEGIN_DECLS
>  
> @@ -461,15 +462,22 @@ struct rtld_global
>  
>    EXTERN void (*_dl_init_static_tls) (struct link_map *);
>  
> -  EXTERN void (*_dl_wait_lookup_done) (void);
> -
>    /* Scopes to free after next THREAD_GSCOPE_WAIT ().  */
>    EXTERN struct dl_scope_free_list
>    {
>      size_t count;
>      void *list[50];
>    } *_dl_scope_free_list;
> -#if !THREAD_GSCOPE_IN_TCB
> +#if THREAD_GSCOPE_IN_TCB
> +  /* List of active thread stacks, with memory managed by glibc.  */
> +  EXTERN list_t _dl_stack_used;
> +
> +  /* List of thread stacks that were allocated by the application.  */
> +  EXTERN list_t _dl_stack_user;
> +
> +  /* Mutex protecting the stack lists.  */
> +  EXTERN int _dl_stack_cache_lock;
> +#else
>    EXTERN int _dl_thread_gscope_count;
>  #endif
>  #ifdef SHARED
> @@ -1252,6 +1260,11 @@ link_map_audit_state (struct link_map *l, size_t index)
>  }
>  #endif /* SHARED */
>  
> +#if THREAD_GSCOPE_IN_TCB
> +void __thread_gscope_wait (void) attribute_hidden;
> +# define THREAD_GSCOPE_WAIT() __thread_gscope_wait ()
> +#endif
> +
>  __END_DECLS
>  
>  #endif /* ldsodefs.h */

Ok.

> diff --git a/sysdeps/hppa/nptl/tls.h b/sysdeps/hppa/nptl/tls.h
> index 2315340735..f319cb42e2 100644
> --- a/sysdeps/hppa/nptl/tls.h
> +++ b/sysdeps/hppa/nptl/tls.h
> @@ -154,8 +154,6 @@ static inline void __set_cr27(struct pthread *cr27)
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* !__ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
> index 5042d52b98..b74347bacd 100644
> --- a/sysdeps/i386/nptl/tls.h
> +++ b/sysdeps/i386/nptl/tls.h
> @@ -387,8 +387,6 @@ tls_fill_user_desc (union user_desc_init *desc,
>    while (0)
>  #define THREAD_GSCOPE_SET_FLAG() \
>    THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
> index f5b1684ef3..42b082dad6 100644
> --- a/sysdeps/ia64/nptl/tls.h
> +++ b/sysdeps/ia64/nptl/tls.h
> @@ -175,8 +175,6 @@ register struct pthread *__thread_self __asm__("r13");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/m68k/nptl/tls.h b/sysdeps/m68k/nptl/tls.h
> index 68ea952e79..69e174484e 100644
> --- a/sysdeps/m68k/nptl/tls.h
> +++ b/sysdeps/m68k/nptl/tls.h
> @@ -153,8 +153,6 @@ extern void * __m68k_read_tp (void);
>        atomic_write_barrier ();						\
>      }									\
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/microblaze/nptl/tls.h b/sysdeps/microblaze/nptl/tls.h
> index a094fdb798..78f63019dc 100644
> --- a/sysdeps/microblaze/nptl/tls.h
> +++ b/sysdeps/microblaze/nptl/tls.h
> @@ -136,8 +136,6 @@ typedef struct
>        atomic_write_barrier ();                                              \
>      }                                                                       \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL (dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/mips/nptl/tls.h b/sysdeps/mips/nptl/tls.h
> index 8b55f19c37..7e7ac43d6c 100644
> --- a/sysdeps/mips/nptl/tls.h
> +++ b/sysdeps/mips/nptl/tls.h
> @@ -178,8 +178,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/nios2/nptl/tls.h b/sysdeps/nios2/nptl/tls.h
> index facb27c761..776fe9bb2d 100644
> --- a/sysdeps/nios2/nptl/tls.h
> +++ b/sysdeps/nios2/nptl/tls.h
> @@ -157,8 +157,6 @@ register struct pthread *__thread_self __asm__("r23");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/nptl/dl-thread_gscope_wait.c b/sysdeps/nptl/dl-thread_gscope_wait.c
> new file mode 100644
> index 0000000000..a9bfbee24c
> --- /dev/null
> +++ b/sysdeps/nptl/dl-thread_gscope_wait.c
> @@ -0,0 +1,80 @@
> +/* Out-of-line notification function for the GSCOPE locking mechanism.
> +   Copyright (C) 2007-2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nptl/descr.h>
> +#include <futex-internal.h>
> +#include <ldsodefs.h>
> +#include <list.h>
> +#include <lowlevellock.h>
> +
> +void
> +__thread_gscope_wait (void)
> +{
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
> +
> +  struct pthread *self = THREAD_SELF;
> +
> +  /* Iterate over the list with system-allocated threads first.  */
> +  list_t *runp;
> +  list_for_each (runp, &GL (dl_stack_used))
> +    {
> +      struct pthread *t = list_entry (runp, struct pthread, list);
> +      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> +        continue;
> +
> +      int *const gscope_flagp = &t->header.gscope_flag;
> +
> +      /* We have to wait until this thread is done with the global
> +         scope.  First tell the thread that we are waiting and
> +         possibly have to be woken.  */
> +      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> +                                                THREAD_GSCOPE_FLAG_WAIT,
> +                                                THREAD_GSCOPE_FLAG_USED))
> +        continue;
> +
> +      do
> +        futex_wait_simple ((unsigned int *) gscope_flagp,
> +                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> +      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> +    }
> +
> +  /* Now the list with threads using user-allocated stacks.  */
> +  list_for_each (runp, &GL (dl_stack_user))
> +    {
> +      struct pthread *t = list_entry (runp, struct pthread, list);
> +      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> +        continue;
> +
> +      int *const gscope_flagp = &t->header.gscope_flag;
> +
> +      /* We have to wait until this thread is done with the global
> +         scope.  First tell the thread that we are waiting and
> +         possibly have to be woken.  */
> +      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> +                                                THREAD_GSCOPE_FLAG_WAIT,
> +                                                THREAD_GSCOPE_FLAG_USED))
> +        continue;
> +
> +      do
> +        futex_wait_simple ((unsigned int *) gscope_flagp,
> +                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> +      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> +    }
> +
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
> +}

Ok, this is basically the former __wait_lookup_done (removed from nptl/allocatestack.c by this patch), now built into ld.so as __thread_gscope_wait.

> diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
> index b1f0b30fbb..261eecfd18 100644
> --- a/sysdeps/powerpc/nptl/tls.h
> +++ b/sysdeps/powerpc/nptl/tls.h
> @@ -245,8 +245,6 @@ register void *__thread_register __asm__ ("r13");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/riscv/nptl/tls.h b/sysdeps/riscv/nptl/tls.h
> index a9167bc143..41d9db10cf 100644
> --- a/sysdeps/riscv/nptl/tls.h
> +++ b/sysdeps/riscv/nptl/tls.h
> @@ -139,8 +139,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
> index 4a9b3570eb..7653109617 100644
> --- a/sysdeps/s390/nptl/tls.h
> +++ b/sysdeps/s390/nptl/tls.h
> @@ -185,8 +185,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
> index 6ccad251f9..2d5e3731b2 100644
> --- a/sysdeps/sh/nptl/tls.h
> +++ b/sysdeps/sh/nptl/tls.h
> @@ -161,8 +161,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
> index 18a548e86b..b08a156131 100644
> --- a/sysdeps/sparc/nptl/tls.h
> +++ b/sysdeps/sparc/nptl/tls.h
> @@ -158,8 +158,6 @@ register struct pthread *__thread_self __asm__("%g7");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* !ASSEMBLER */
>  

Ok.

> diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
> index fbd7f9cb89..a08bf972de 100644
> --- a/sysdeps/x86_64/nptl/tls.h
> +++ b/sysdeps/x86_64/nptl/tls.h
> @@ -332,8 +332,6 @@ _Static_assert (offsetof (tcbhead_t, __glibc_unused2) == 0x80,
>    while (0)
>  # define THREAD_GSCOPE_SET_FLAG() \
>    THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  
> 

Ok.
diff mbox series

Patch

diff --git a/csu/libc-tls.c b/csu/libc-tls.c
index 06e76bd395..c3589f0a7d 100644
--- a/csu/libc-tls.c
+++ b/csu/libc-tls.c
@@ -24,6 +24,7 @@ 
 #include <stdio.h>
 #include <sys/param.h>
 #include <array_length.h>
+#include <list.h>
 
 #ifdef SHARED
  #error makefile bug, this file is for static only
@@ -193,6 +194,12 @@  __libc_setup_tls (void)
   if (__builtin_expect (lossage != NULL, 0))
     _startup_fatal (lossage);
 
+#if THREAD_GSCOPE_IN_TCB
+  INIT_LIST_HEAD (&_dl_stack_used);
+  INIT_LIST_HEAD (&_dl_stack_user);
+  list_add (&THREAD_SELF->list, &_dl_stack_user);
+#endif
+
   /* Update the executable's link map with enough information to make
      the TLS routines happy.  */
   main_map->l_tls_align = align;
diff --git a/elf/Makefile b/elf/Makefile
index f10cc59e7c..2015383eb2 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -34,7 +34,8 @@  dl-routines	= $(addprefix dl-,load lookup object reloc deps \
 				  version profile tls origin scope \
 				  execstack open close trampoline \
 				  exception sort-maps lookup-direct \
-				  call-libc-early-init write)
+				  call-libc-early-init write \
+				  thread_gscope_wait)
 ifeq (yes,$(use-ldconfig))
 dl-routines += dl-cache
 endif
@@ -54,7 +55,8 @@  endif
 all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
 # But they are absent from the shared libc, because that code is in ld.so.
 elide-routines.os = $(all-dl-routines) dl-support enbl-secure dl-origin \
-		    dl-sysdep dl-exception dl-reloc-static-pie
+		    dl-sysdep dl-exception dl-reloc-static-pie \
+		    thread_gscope_wait
 
 # ld.so uses those routines, plus some special stuff for being the program
 # interpreter and operating independent of libc.
diff --git a/elf/dl-support.c b/elf/dl-support.c
index afbc94df54..15e6d787c9 100644
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -183,10 +183,11 @@  ElfW(Word) _dl_stack_flags = DEFAULT_STACK_PERMS;
 int (*_dl_make_stack_executable_hook) (void **) = _dl_make_stack_executable;
 
 
-/* Function in libpthread to wait for termination of lookups.  */
-void (*_dl_wait_lookup_done) (void);
-
-#if !THREAD_GSCOPE_IN_TCB
+#if THREAD_GSCOPE_IN_TCB
+list_t _dl_stack_used;
+list_t _dl_stack_user;
+int _dl_stack_cache_lock;
+#else
 int _dl_thread_gscope_count;
 #endif
 struct dl_scope_free_list *_dl_scope_free_list;
diff --git a/elf/dl-thread_gscope_wait.c b/elf/dl-thread_gscope_wait.c
new file mode 100644
index 0000000000..28e1494730
--- /dev/null
+++ b/elf/dl-thread_gscope_wait.c
@@ -0,0 +1,2 @@ 
+/* By default, the dynamic linker does not use an out-of-line
+   __thread_gscope_wait function.  */
diff --git a/elf/rtld.c b/elf/rtld.c
index 5d117d0d2c..c4ffc8d4b7 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -48,6 +48,7 @@ 
 #include <array_length.h>
 #include <libc-early-init.h>
 #include <dl-main.h>
+#include <list.h>
 
 #include <assert.h>
 
@@ -799,6 +800,9 @@  cannot allocate TLS data structures for initial thread\n");
   const char *lossage = TLS_INIT_TP (tcbp);
   if (__glibc_unlikely (lossage != NULL))
     _dl_fatal_printf ("cannot set up thread-local storage: %s\n", lossage);
+#if THREAD_GSCOPE_IN_TCB
+  list_add (&THREAD_SELF->list, &GL (dl_stack_user));
+#endif
   tls_init_tp_called = true;
 
   return tcbp;
@@ -1139,6 +1143,11 @@  dl_main (const ElfW(Phdr) *phdr,
   GL(dl_rtld_unlock_recursive) = rtld_lock_default_unlock_recursive;
 #endif
 
+#if THREAD_GSCOPE_IN_TCB
+  INIT_LIST_HEAD (&GL (dl_stack_used));
+  INIT_LIST_HEAD (&GL (dl_stack_user));
+#endif
+
   /* The explicit initialization here is cheaper than processing the reloc
      in the _rtld_local definition's initializer.  */
   GL(dl_make_stack_executable_hook) = &_dl_make_stack_executable;
@@ -2383,6 +2392,9 @@  dl_main (const ElfW(Phdr) *phdr,
       if (__glibc_unlikely (lossage != NULL))
 	_dl_fatal_printf ("cannot set up thread-local storage: %s\n",
 			  lossage);
+#if THREAD_GSCOPE_IN_TCB
+      list_add (&THREAD_SELF->list, &GL (dl_stack_user));
+#endif
     }
 
   /* Make sure no new search directories have been added.  */
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 4b45f8c884..b7f9eeebf6 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -106,26 +106,14 @@ 
 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40MiBi by default.  */
 static size_t stack_cache_actsize;
 
-/* Mutex protecting this variable.  */
-static int stack_cache_lock = LLL_LOCK_INITIALIZER;
-
 /* List of queued stack frames.  */
 static LIST_HEAD (stack_cache);
 
-/* List of the stacks in use.  */
-static LIST_HEAD (stack_used);
-
 /* We need to record what list operations we are going to do so that,
    in case of an asynchronous interruption due to a fork() call, we
    can correct for the work.  */
 static uintptr_t in_flight_stack;
 
-/* List of the threads with user provided stacks in use.  No need to
-   initialize this, since it's done in __pthread_initialize_minimal.  */
-list_t __stack_user __attribute__ ((nocommon));
-hidden_data_def (__stack_user)
-
-
 /* Check whether the stack is still used or not.  */
 #define FREE_P(descr) ((descr)->tid <= 0)
 
@@ -173,7 +161,7 @@  get_cached_stack (size_t *sizep, void **memp)
   struct pthread *result = NULL;
   list_t *entry;
 
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Search the cache for a matching entry.  We search for the
      smallest stack which has at least the required size.  Note that
@@ -206,7 +194,7 @@  get_cached_stack (size_t *sizep, void **memp)
       || __builtin_expect (result->stackblock_size > 4 * size, 0))
     {
       /* Release the lock.  */
-      lll_unlock (stack_cache_lock, LLL_PRIVATE);
+      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
       return NULL;
     }
@@ -218,13 +206,13 @@  get_cached_stack (size_t *sizep, void **memp)
   stack_list_del (&result->list);
 
   /* And add to the list of stacks in use.  */
-  stack_list_add (&result->list, &stack_used);
+  stack_list_add (&result->list, &GL (dl_stack_used));
 
   /* And decrease the cache size.  */
   stack_cache_actsize -= result->stackblock_size;
 
   /* Release the lock early.  */
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Report size and location of the stack to the caller.  */
   *sizep = result->stackblock_size;
@@ -510,12 +498,12 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
 
       /* Prepare to modify global data.  */
-      lll_lock (stack_cache_lock, LLL_PRIVATE);
+      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
       /* And add to the list of stacks in use.  */
-      list_add (&pd->list, &__stack_user);
+      list_add (&pd->list, &GL (dl_stack_user));
 
-      lll_unlock (stack_cache_lock, LLL_PRIVATE);
+      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     }
   else
     {
@@ -644,12 +632,12 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
 
 	  /* Prepare to modify global data.  */
-	  lll_lock (stack_cache_lock, LLL_PRIVATE);
+	  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 	  /* And add to the list of stacks in use.  */
-	  stack_list_add (&pd->list, &stack_used);
+	  stack_list_add (&pd->list, &GL (dl_stack_used));
 
-	  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+	  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 
 	  /* There might have been a race.  Another thread might have
@@ -690,12 +678,12 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 	  if (__mprotect (guard, guardsize, PROT_NONE) != 0)
 	    {
 	    mprot_error:
-	      lll_lock (stack_cache_lock, LLL_PRIVATE);
+	      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 	      /* Remove the thread from the list.  */
 	      stack_list_del (&pd->list);
 
-	      lll_unlock (stack_cache_lock, LLL_PRIVATE);
+	      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 	      /* Get rid of the TLS block we allocated.  */
 	      _dl_deallocate_tls (TLS_TPADJ (pd), false);
@@ -799,7 +787,7 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 void
 __deallocate_stack (struct pthread *pd)
 {
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Remove the thread from the list of threads with user defined
      stacks.  */
@@ -815,7 +803,7 @@  __deallocate_stack (struct pthread *pd)
     /* Free the memory associated with the ELF TLS.  */
     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 }
 
 
@@ -831,10 +819,10 @@  __make_stacks_executable (void **stack_endp)
   const size_t pagemask = ~(__getpagesize () - 1);
 #endif
 
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       err = change_stack_perm (list_entry (runp, struct pthread, list)
 #ifdef NEED_SEPARATE_REGISTER_STACK
@@ -860,7 +848,7 @@  __make_stacks_executable (void **stack_endp)
 	  break;
       }
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   return err;
 }
@@ -891,8 +879,8 @@  __reclaim_stacks (void)
 	     pointers at the head of the list are inconsistent.  */
 	  list_t *l = NULL;
 
-	  if (stack_used.next->prev != &stack_used)
-	    l = &stack_used;
+	  if (GL (dl_stack_used).next->prev != &GL (dl_stack_used))
+	    l = &GL (dl_stack_used);
 	  else if (stack_cache.next->prev != &stack_cache)
 	    l = &stack_cache;
 
@@ -914,7 +902,7 @@  __reclaim_stacks (void)
 
   /* Mark all stacks except the still running one as free.  */
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       struct pthread *curp = list_entry (runp, struct pthread, list);
       if (curp != self)
@@ -948,7 +936,7 @@  __reclaim_stacks (void)
     }
 
   /* Add the stack of all running threads to the cache.  */
-  list_splice (&stack_used, &stack_cache);
+  list_splice (&GL (dl_stack_used), &stack_cache);
 
   /* Remove the entry for the current thread to from the cache list
      and add it to the list of running threads.  Which of the two
@@ -956,13 +944,13 @@  __reclaim_stacks (void)
   stack_list_del (&self->list);
 
   /* Re-initialize the lists for all the threads.  */
-  INIT_LIST_HEAD (&stack_used);
-  INIT_LIST_HEAD (&__stack_user);
+  INIT_LIST_HEAD (&GL (dl_stack_used));
+  INIT_LIST_HEAD (&GL (dl_stack_user));
 
   if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
-    list_add (&self->list, &__stack_user);
+    list_add (&self->list, &GL (dl_stack_user));
   else
-    list_add (&self->list, &stack_used);
+    list_add (&self->list, &GL (dl_stack_used));
 
   /* There is one thread running.  */
   __nptl_nthreads = 1;
@@ -970,7 +958,7 @@  __reclaim_stacks (void)
   in_flight_stack = 0;
 
   /* Initialize locks.  */
-  stack_cache_lock = LLL_LOCK_INITIALIZER;
+  GL (dl_stack_cache_lock) = LLL_LOCK_INITIALIZER;
   __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
 }
 
@@ -1083,7 +1071,7 @@  __nptl_setxid (struct xid_command *cmdp)
 {
   int signalled;
   int result;
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   __xidcmd = cmdp;
   cmdp->cntr = 0;
@@ -1093,7 +1081,7 @@  __nptl_setxid (struct xid_command *cmdp)
 
   /* Iterate over the list with system-allocated threads first.  */
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1103,7 +1091,7 @@  __nptl_setxid (struct xid_command *cmdp)
     }
 
   /* Now the list with threads using user-allocated stacks.  */
-  list_for_each (runp, &__stack_user)
+  list_for_each (runp, &GL (dl_stack_user))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1119,7 +1107,7 @@  __nptl_setxid (struct xid_command *cmdp)
     {
       signalled = 0;
 
-      list_for_each (runp, &stack_used)
+      list_for_each (runp, &GL (dl_stack_used))
 	{
 	  struct pthread *t = list_entry (runp, struct pthread, list);
 	  if (t == self)
@@ -1128,7 +1116,7 @@  __nptl_setxid (struct xid_command *cmdp)
 	  signalled += setxid_signal_thread (cmdp, t);
 	}
 
-      list_for_each (runp, &__stack_user)
+      list_for_each (runp, &GL (dl_stack_user))
 	{
 	  struct pthread *t = list_entry (runp, struct pthread, list);
 	  if (t == self)
@@ -1149,7 +1137,7 @@  __nptl_setxid (struct xid_command *cmdp)
 
   /* Clean up flags, so that no thread blocks during exit waiting
      for a signal which will never come.  */
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1158,7 +1146,7 @@  __nptl_setxid (struct xid_command *cmdp)
       setxid_unmark_thread (cmdp, t);
     }
 
-  list_for_each (runp, &__stack_user)
+  list_for_each (runp, &GL (dl_stack_user))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1180,7 +1168,7 @@  __nptl_setxid (struct xid_command *cmdp)
     }
   __nptl_setxid_error (cmdp, error);
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
   return result;
 }
 
@@ -1204,75 +1192,16 @@  void
 attribute_hidden
 __pthread_init_static_tls (struct link_map *map)
 {
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Iterate over the list with system-allocated threads first.  */
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     init_one_static_tls (list_entry (runp, struct pthread, list), map);
 
   /* Now the list with threads using user-allocated stacks.  */
-  list_for_each (runp, &__stack_user)
+  list_for_each (runp, &GL (dl_stack_user))
     init_one_static_tls (list_entry (runp, struct pthread, list), map);
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
-}
-
-
-void
-attribute_hidden
-__wait_lookup_done (void)
-{
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
-
-  struct pthread *self = THREAD_SELF;
-
-  /* Iterate over the list with system-allocated threads first.  */
-  list_t *runp;
-  list_for_each (runp, &stack_used)
-    {
-      struct pthread *t = list_entry (runp, struct pthread, list);
-      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
-	continue;
-
-      int *const gscope_flagp = &t->header.gscope_flag;
-
-      /* We have to wait until this thread is done with the global
-	 scope.  First tell the thread that we are waiting and
-	 possibly have to be woken.  */
-      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
-						THREAD_GSCOPE_FLAG_WAIT,
-						THREAD_GSCOPE_FLAG_USED))
-	continue;
-
-      do
-	futex_wait_simple ((unsigned int *) gscope_flagp,
-			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
-      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
-    }
-
-  /* Now the list with threads using user-allocated stacks.  */
-  list_for_each (runp, &__stack_user)
-    {
-      struct pthread *t = list_entry (runp, struct pthread, list);
-      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
-	continue;
-
-      int *const gscope_flagp = &t->header.gscope_flag;
-
-      /* We have to wait until this thread is done with the global
-	 scope.  First tell the thread that we are waiting and
-	 possibly have to be woken.  */
-      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
-						THREAD_GSCOPE_FLAG_WAIT,
-						THREAD_GSCOPE_FLAG_USED))
-	continue;
-
-      do
-	futex_wait_simple ((unsigned int *) gscope_flagp,
-			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
-      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
-    }
-
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 }
diff --git a/nptl/descr.h b/nptl/descr.h
index d8343ff9a1..b172ee408b 100644
--- a/nptl/descr.h
+++ b/nptl/descr.h
@@ -162,7 +162,8 @@  struct pthread
     void *__padding[24];
   };
 
-  /* This descriptor's link on the `stack_used' or `__stack_user' list.  */
+  /* This descriptor's link on the GL (dl_stack_used) or
+     GL (dl_stack_user) list.  */
   list_t list;
 
   /* Thread ID - which is also a 'is this thread descriptor (and
diff --git a/nptl/nptl-init.c b/nptl/nptl-init.c
index 4aa1231bec..53b817715d 100644
--- a/nptl/nptl-init.c
+++ b/nptl/nptl-init.c
@@ -251,12 +251,9 @@  __pthread_initialize_minimal_internal (void)
      purposes this is good enough.  */
   THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end);
 
-  /* Initialize the list of all running threads with the main thread.  */
-  INIT_LIST_HEAD (&__stack_user);
-  list_add (&pd->list, &__stack_user);
-
-  /* Before initializing __stack_user, the debugger could not find us and
-     had to set __nptl_initial_report_events.  Propagate its setting.  */
+  /* Before initializing GL (dl_stack_user), the debugger could not
+     find us and had to set __nptl_initial_report_events.  Propagate
+     its setting.  */
   THREAD_SETMEM (pd, report_events, __nptl_initial_report_events);
 
   struct sigaction sa;
@@ -336,8 +333,6 @@  __pthread_initialize_minimal_internal (void)
 
   GL(dl_init_static_tls) = &__pthread_init_static_tls;
 
-  GL(dl_wait_lookup_done) = &__wait_lookup_done;
-
   /* Register the fork generation counter with the libc.  */
 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
   __libc_multiple_threads_ptr =
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 686f54aa93..a7510f9f63 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -208,10 +208,6 @@  extern void __default_pthread_attr_freeres (void) attribute_hidden;
 extern size_t __static_tls_size attribute_hidden;
 extern size_t __static_tls_align_m1 attribute_hidden;
 
-/* Thread descriptor handling.  */
-extern list_t __stack_user;
-hidden_proto (__stack_user)
-
 /* Attribute handling.  */
 extern struct pthread_attr *__attr_list attribute_hidden;
 extern int __attr_list_lock attribute_hidden;
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 447f005ece..bad4e57a84 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -213,9 +213,9 @@  __find_in_stack_list (struct pthread *pd)
   list_t *entry;
   struct pthread *result = NULL;
 
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
-  list_for_each (entry, &stack_used)
+  list_for_each (entry, &GL (dl_stack_used))
     {
       struct pthread *curp;
 
@@ -228,7 +228,7 @@  __find_in_stack_list (struct pthread *pd)
     }
 
   if (result == NULL)
-    list_for_each (entry, &__stack_user)
+    list_for_each (entry, &GL (dl_stack_user))
       {
 	struct pthread *curp;
 
@@ -240,7 +240,7 @@  __find_in_stack_list (struct pthread *pd)
 	  }
       }
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   return result;
 }
diff --git a/nptl_db/structs.def b/nptl_db/structs.def
index 472e9008fa..2ca33598b1 100644
--- a/nptl_db/structs.def
+++ b/nptl_db/structs.def
@@ -69,8 +69,6 @@  DB_STRUCT (td_eventbuf_t)
 DB_STRUCT_FIELD (td_eventbuf_t, eventnum)
 DB_STRUCT_FIELD (td_eventbuf_t, eventdata)
 
-DB_SYMBOL (stack_used)
-DB_SYMBOL (__stack_user)
 DB_SYMBOL (nptl_version)
 DB_FUNCTION (__nptl_create_event)
 DB_FUNCTION (__nptl_death_event)
@@ -106,6 +104,8 @@  DB_STRUCT (rtld_global)
 DB_RTLD_VARIABLE (_rtld_global)
 #endif
 DB_RTLD_GLOBAL_FIELD (dl_tls_dtv_slotinfo_list)
+DB_RTLD_GLOBAL_FIELD (dl_stack_user)
+DB_RTLD_GLOBAL_FIELD (dl_stack_used)
 
 DB_STRUCT (dtv_slotinfo_list)
 DB_STRUCT_FIELD (dtv_slotinfo_list, len)
diff --git a/nptl_db/td_init.c b/nptl_db/td_init.c
index 86773dcb06..79a1f90312 100644
--- a/nptl_db/td_init.c
+++ b/nptl_db/td_init.c
@@ -29,3 +29,17 @@  td_init (void)
   LOG ("td_init");
   return TD_OK;
 }
+
+bool
+__td_ta_rtld_global (td_thragent_t *ta)
+{
+  if (ta->ta_addr__rtld_global == 0
+      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
+                        &ta->ta_addr__rtld_global) != PS_OK)
+    {
+      ta->ta_addr__rtld_global = (void*)-1;
+      return false;
+    }
+  else
+    return ta->ta_addr__rtld_global != (void*)-1;
+}
diff --git a/nptl_db/td_ta_map_lwp2thr.c b/nptl_db/td_ta_map_lwp2thr.c
index d04d4bec23..81ff48808c 100644
--- a/nptl_db/td_ta_map_lwp2thr.c
+++ b/nptl_db/td_ta_map_lwp2thr.c
@@ -187,7 +187,7 @@  td_ta_map_lwp2thr (const td_thragent_t *ta_arg,
      fake a special descriptor for the initial thread.  */
 
   psaddr_t list;
-  td_err_e err = DB_GET_SYMBOL (list, ta, __stack_user);
+  td_err_e err = __td_ta_stack_user (ta, &list);
   if (err != TD_OK)
     return err;
 
diff --git a/nptl_db/td_ta_thr_iter.c b/nptl_db/td_ta_thr_iter.c
index d59782bd6e..e406bcbd39 100644
--- a/nptl_db/td_ta_thr_iter.c
+++ b/nptl_db/td_ta_thr_iter.c
@@ -133,14 +133,14 @@  td_ta_thr_iter (const td_thragent_t *ta_arg, td_thr_iter_f *callback,
      have to iterate over both lists separately.  We start with the
      list of threads with user-defined stacks.  */
 
-  err = DB_GET_SYMBOL (list, ta, __stack_user);
+  err = __td_ta_stack_user (ta, &list);
   if (err == TD_OK)
     err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
 			       list, true);
 
   /* And the threads with stacks allocated by the implementation.  */
   if (err == TD_OK)
-    err = DB_GET_SYMBOL (list, ta, stack_used);
+    err = __td_ta_stack_used (ta, &list);
   if (err == TD_OK)
     err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
 			       list, false);
diff --git a/nptl_db/td_thr_tlsbase.c b/nptl_db/td_thr_tlsbase.c
index 8198934112..eeba0a04d8 100644
--- a/nptl_db/td_thr_tlsbase.c
+++ b/nptl_db/td_thr_tlsbase.c
@@ -28,12 +28,7 @@  dtv_slotinfo_list (td_thragent_t *ta,
   td_err_e err;
   psaddr_t head;
 
-  if (ta->ta_addr__rtld_global == 0
-      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
-			&ta->ta_addr__rtld_global) != PS_OK)
-    ta->ta_addr__rtld_global = (void*)-1;
-
-  if (ta->ta_addr__rtld_global != (void*)-1)
+  if (__td_ta_rtld_global (ta))
     {
       err = DB_GET_FIELD (head, ta, ta->ta_addr__rtld_global,
 			  rtld_global, _dl_tls_dtv_slotinfo_list, 0);
diff --git a/nptl_db/td_thr_validate.c b/nptl_db/td_thr_validate.c
index c709c5e9be..d2f4107cf2 100644
--- a/nptl_db/td_thr_validate.c
+++ b/nptl_db/td_thr_validate.c
@@ -20,6 +20,40 @@ 
 #include "thread_dbP.h"
 #include <stdbool.h>
 
+td_err_e
+__td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
+{
+  if (__td_ta_rtld_global (ta))
+    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
+				 rtld_global, _dl_stack_user, 0);
+  else
+    {
+      if (ta->ta_addr__dl_stack_user == 0
+	  && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_user,
+			    &ta->ta_addr__dl_stack_user) != PS_OK)
+	return TD_ERR;
+      *plist = ta->ta_addr__dl_stack_user;
+      return TD_OK;
+    }
+}
+
+td_err_e
+__td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
+{
+  if (__td_ta_rtld_global (ta))
+    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
+				 rtld_global, _dl_stack_used, 0);
+  else
+    {
+      if (ta->ta_addr__dl_stack_used == 0
+	  && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_used,
+			    &ta->ta_addr__dl_stack_used) != PS_OK)
+	return TD_ERR;
+      *plist = ta->ta_addr__dl_stack_used;
+      return TD_OK;
+    }
+}
+
 static td_err_e
 check_thread_list (const td_thrhandle_t *th, psaddr_t head, bool *uninit)
 {
@@ -62,7 +96,7 @@  td_thr_validate (const td_thrhandle_t *th)
 
   /* First check the list with threads using user allocated stacks.  */
   bool uninit = false;
-  err = DB_GET_SYMBOL (list, th->th_ta_p, __stack_user);
+  err = __td_ta_stack_user (th->th_ta_p, &list);
   if (err == TD_OK)
     err = check_thread_list (th, list, &uninit);
 
@@ -70,7 +104,7 @@  td_thr_validate (const td_thrhandle_t *th)
      using implementation allocated stacks.  */
   if (err == TD_NOTHR)
     {
-      err = DB_GET_SYMBOL (list, th->th_ta_p, stack_used);
+      err = __td_ta_stack_used (th->th_ta_p, &list);
       if (err == TD_OK)
 	err = check_thread_list (th, list, &uninit);
 
diff --git a/nptl_db/thread_dbP.h b/nptl_db/thread_dbP.h
index 8a666b8d4f..9574e68d70 100644
--- a/nptl_db/thread_dbP.h
+++ b/nptl_db/thread_dbP.h
@@ -269,4 +269,17 @@  extern td_err_e _td_check_sizeof (td_thragent_t *ta, uint32_t *sizep,
 extern td_err_e __td_ta_lookup_th_unique (const td_thragent_t *ta,
 					  lwpid_t lwpid, td_thrhandle_t *th);
 
+/* Try to initialize TA->ta_addr__rtld_global.  Return true on
+   success, false on failure (which may be cached).  */
+bool __td_ta_rtld_global (td_thragent_t *ta) attribute_hidden;
+
+/* Obtain the address of the list_t fields _dl_stack_user and
+   _dl_stack_used in _rtld_global, or fall back to the global
+   variables of the same name (to support statically linked
+   programs).  */
+td_err_e __td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
+  attribute_hidden;
+td_err_e __td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
+  attribute_hidden;
+
 #endif /* thread_dbP.h */
diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h
index e5627f777e..dbef4704f1 100644
--- a/sysdeps/aarch64/nptl/tls.h
+++ b/sysdeps/aarch64/nptl/tls.h
@@ -129,8 +129,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 # endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/alpha/nptl/tls.h b/sysdeps/alpha/nptl/tls.h
index 82549607fd..f3101f57c9 100644
--- a/sysdeps/alpha/nptl/tls.h
+++ b/sysdeps/alpha/nptl/tls.h
@@ -123,8 +123,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #else /* __ASSEMBLER__ */
 # include <tcb-offsets.h>
diff --git a/sysdeps/arc/nptl/tls.h b/sysdeps/arc/nptl/tls.h
index 184b550ab5..cc5abb1931 100644
--- a/sysdeps/arc/nptl/tls.h
+++ b/sysdeps/arc/nptl/tls.h
@@ -131,8 +131,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* !__ASSEMBLER__ */
 
diff --git a/sysdeps/arm/nptl/tls.h b/sysdeps/arm/nptl/tls.h
index 24a488ff37..91dd7498e4 100644
--- a/sysdeps/arm/nptl/tls.h
+++ b/sysdeps/arm/nptl/tls.h
@@ -120,8 +120,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/csky/nptl/tls.h b/sysdeps/csky/nptl/tls.h
index bcca9674a1..f308773d40 100644
--- a/sysdeps/csky/nptl/tls.h
+++ b/sysdeps/csky/nptl/tls.h
@@ -147,8 +147,6 @@  typedef struct
       atomic_write_barrier ();						      \
     }									      \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 382eeb9be0..b1da03cafe 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -38,6 +38,7 @@ 
 #include <libc-lock.h>
 #include <hp-timing.h>
 #include <tls.h>
+#include <list_t.h>
 
 __BEGIN_DECLS
 
@@ -461,15 +462,22 @@  struct rtld_global
 
   EXTERN void (*_dl_init_static_tls) (struct link_map *);
 
-  EXTERN void (*_dl_wait_lookup_done) (void);
-
   /* Scopes to free after next THREAD_GSCOPE_WAIT ().  */
   EXTERN struct dl_scope_free_list
   {
     size_t count;
     void *list[50];
   } *_dl_scope_free_list;
-#if !THREAD_GSCOPE_IN_TCB
+#if THREAD_GSCOPE_IN_TCB
+  /* List of active thread stacks, with memory managed by glibc.  */
+  EXTERN list_t _dl_stack_used;
+
+  /* List of thread stacks that were allocated by the application.  */
+  EXTERN list_t _dl_stack_user;
+
+  /* Mutex protecting the stack lists.  */
+  EXTERN int _dl_stack_cache_lock;
+#else
   EXTERN int _dl_thread_gscope_count;
 #endif
 #ifdef SHARED
@@ -1252,6 +1260,11 @@  link_map_audit_state (struct link_map *l, size_t index)
 }
 #endif /* SHARED */
 
+#if THREAD_GSCOPE_IN_TCB
+void __thread_gscope_wait (void) attribute_hidden;
+# define THREAD_GSCOPE_WAIT() __thread_gscope_wait ()
+#endif
+
 __END_DECLS
 
 #endif /* ldsodefs.h */
diff --git a/sysdeps/hppa/nptl/tls.h b/sysdeps/hppa/nptl/tls.h
index 2315340735..f319cb42e2 100644
--- a/sysdeps/hppa/nptl/tls.h
+++ b/sysdeps/hppa/nptl/tls.h
@@ -154,8 +154,6 @@  static inline void __set_cr27(struct pthread *cr27)
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* !__ASSEMBLER__ */
 
diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
index 5042d52b98..b74347bacd 100644
--- a/sysdeps/i386/nptl/tls.h
+++ b/sysdeps/i386/nptl/tls.h
@@ -387,8 +387,6 @@  tls_fill_user_desc (union user_desc_init *desc,
   while (0)
 #define THREAD_GSCOPE_SET_FLAG() \
   THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
index f5b1684ef3..42b082dad6 100644
--- a/sysdeps/ia64/nptl/tls.h
+++ b/sysdeps/ia64/nptl/tls.h
@@ -175,8 +175,6 @@  register struct pthread *__thread_self __asm__("r13");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/m68k/nptl/tls.h b/sysdeps/m68k/nptl/tls.h
index 68ea952e79..69e174484e 100644
--- a/sysdeps/m68k/nptl/tls.h
+++ b/sysdeps/m68k/nptl/tls.h
@@ -153,8 +153,6 @@  extern void * __m68k_read_tp (void);
       atomic_write_barrier ();						\
     }									\
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/microblaze/nptl/tls.h b/sysdeps/microblaze/nptl/tls.h
index a094fdb798..78f63019dc 100644
--- a/sysdeps/microblaze/nptl/tls.h
+++ b/sysdeps/microblaze/nptl/tls.h
@@ -136,8 +136,6 @@  typedef struct
       atomic_write_barrier ();                                              \
     }                                                                       \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL (dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/mips/nptl/tls.h b/sysdeps/mips/nptl/tls.h
index 8b55f19c37..7e7ac43d6c 100644
--- a/sysdeps/mips/nptl/tls.h
+++ b/sysdeps/mips/nptl/tls.h
@@ -178,8 +178,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/nios2/nptl/tls.h b/sysdeps/nios2/nptl/tls.h
index facb27c761..776fe9bb2d 100644
--- a/sysdeps/nios2/nptl/tls.h
+++ b/sysdeps/nios2/nptl/tls.h
@@ -157,8 +157,6 @@  register struct pthread *__thread_self __asm__("r23");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/nptl/dl-thread_gscope_wait.c b/sysdeps/nptl/dl-thread_gscope_wait.c
new file mode 100644
index 0000000000..a9bfbee24c
--- /dev/null
+++ b/sysdeps/nptl/dl-thread_gscope_wait.c
@@ -0,0 +1,80 @@ 
+/* Out-of-line notification function for the GSCOPE locking mechanism.
+   Copyright (C) 2007-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nptl/descr.h>
+#include <futex-internal.h>
+#include <ldsodefs.h>
+#include <list.h>
+#include <lowlevellock.h>
+
+void
+__thread_gscope_wait (void)
+{
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
+
+  struct pthread *self = THREAD_SELF;
+
+  /* Iterate over the list with system-allocated threads first.  */
+  list_t *runp;
+  list_for_each (runp, &GL (dl_stack_used))
+    {
+      struct pthread *t = list_entry (runp, struct pthread, list);
+      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
+        continue;
+
+      int *const gscope_flagp = &t->header.gscope_flag;
+
+      /* We have to wait until this thread is done with the global
+         scope.  First tell the thread that we are waiting and
+         possibly have to be woken.  */
+      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
+                                                THREAD_GSCOPE_FLAG_WAIT,
+                                                THREAD_GSCOPE_FLAG_USED))
+        continue;
+
+      do
+        futex_wait_simple ((unsigned int *) gscope_flagp,
+                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
+      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
+    }
+
+  /* Now the list with threads using user-allocated stacks.  */
+  list_for_each (runp, &GL (dl_stack_user))
+    {
+      struct pthread *t = list_entry (runp, struct pthread, list);
+      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
+        continue;
+
+      int *const gscope_flagp = &t->header.gscope_flag;
+
+      /* We have to wait until this thread is done with the global
+         scope.  First tell the thread that we are waiting and
+         possibly have to be woken.  */
+      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
+                                                THREAD_GSCOPE_FLAG_WAIT,
+                                                THREAD_GSCOPE_FLAG_USED))
+        continue;
+
+      do
+        futex_wait_simple ((unsigned int *) gscope_flagp,
+                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
+      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
+    }
+
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
+}
diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
index b1f0b30fbb..261eecfd18 100644
--- a/sysdeps/powerpc/nptl/tls.h
+++ b/sysdeps/powerpc/nptl/tls.h
@@ -245,8 +245,6 @@  register void *__thread_register __asm__ ("r13");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/riscv/nptl/tls.h b/sysdeps/riscv/nptl/tls.h
index a9167bc143..41d9db10cf 100644
--- a/sysdeps/riscv/nptl/tls.h
+++ b/sysdeps/riscv/nptl/tls.h
@@ -139,8 +139,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
index 4a9b3570eb..7653109617 100644
--- a/sysdeps/s390/nptl/tls.h
+++ b/sysdeps/s390/nptl/tls.h
@@ -185,8 +185,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
index 6ccad251f9..2d5e3731b2 100644
--- a/sysdeps/sh/nptl/tls.h
+++ b/sysdeps/sh/nptl/tls.h
@@ -161,8 +161,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
index 18a548e86b..b08a156131 100644
--- a/sysdeps/sparc/nptl/tls.h
+++ b/sysdeps/sparc/nptl/tls.h
@@ -158,8 +158,6 @@  register struct pthread *__thread_self __asm__("%g7");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* !ASSEMBLER */
 
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index fbd7f9cb89..a08bf972de 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -332,8 +332,6 @@  _Static_assert (offsetof (tcbhead_t, __glibc_unused2) == 0x80,
   while (0)
 # define THREAD_GSCOPE_SET_FLAG() \
   THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */