diff mbox series

nptl: Move stack list variables into _rtld_global

Message ID 87a6vlthqn.fsf@oldenburg2.str.redhat.com
State New
Headers show
Series nptl: Move stack list variables into _rtld_global | expand

Commit Message

Florian Weimer Nov. 13, 2020, 3:10 p.m. UTC
With the stack list variables moved into _rtld_global,
__thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
formerly __wait_lookup_done) can now be implemented directly in
ld.so, eliminating the unprotected GL (dl_wait_lookup_done) function
pointer.

---
 csu/libc-tls.c                       |   7 ++
 elf/Makefile                         |   6 +-
 elf/dl-support.c                     |   9 ++-
 elf/dl-thread_gscope_wait.c          |   2 +
 elf/rtld.c                           |  12 +++
 nptl/allocatestack.c                 | 147 +++++++++--------------------------
 nptl/descr.h                         |   3 +-
 nptl/nptl-init.c                     |  11 +--
 nptl/pthreadP.h                      |   4 -
 nptl/pthread_create.c                |   8 +-
 nptl_db/structs.def                  |   4 +-
 nptl_db/td_init.c                    |  14 ++++
 nptl_db/td_ta_map_lwp2thr.c          |   2 +-
 nptl_db/td_ta_thr_iter.c             |   4 +-
 nptl_db/td_thr_tlsbase.c             |   7 +-
 nptl_db/td_thr_validate.c            |  39 +++++++++-
 nptl_db/thread_dbP.h                 |  13 ++++
 sysdeps/aarch64/nptl/tls.h           |   2 -
 sysdeps/alpha/nptl/tls.h             |   2 -
 sysdeps/arc/nptl/tls.h               |   2 -
 sysdeps/arm/nptl/tls.h               |   2 -
 sysdeps/csky/nptl/tls.h              |   2 -
 sysdeps/generic/ldsodefs.h           |  19 ++++-
 sysdeps/hppa/nptl/tls.h              |   2 -
 sysdeps/i386/nptl/tls.h              |   2 -
 sysdeps/ia64/nptl/tls.h              |   2 -
 sysdeps/m68k/nptl/tls.h              |   2 -
 sysdeps/microblaze/nptl/tls.h        |   2 -
 sysdeps/mips/nptl/tls.h              |   2 -
 sysdeps/nios2/nptl/tls.h             |   2 -
 sysdeps/nptl/dl-thread_gscope_wait.c |  80 +++++++++++++++++++
 sysdeps/powerpc/nptl/tls.h           |   2 -
 sysdeps/riscv/nptl/tls.h             |   2 -
 sysdeps/s390/nptl/tls.h              |   2 -
 sysdeps/sh/nptl/tls.h                |   2 -
 sysdeps/sparc/nptl/tls.h             |   2 -
 sysdeps/x86_64/nptl/tls.h            |   2 -
 37 files changed, 243 insertions(+), 184 deletions(-)

Comments

Adhemerval Zanella Netto Nov. 16, 2020, 6:02 p.m. UTC | #1
On 13/11/2020 12:10, Florian Weimer via Libc-alpha wrote:
> Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
> formerly __wait_lookup_done) can be implemented directly in ld.so,
> eliminating the unprotected GL (dl_wait_lookup_done) function
> pointer.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> 
> ---
>  csu/libc-tls.c                       |   7 ++
>  elf/Makefile                         |   6 +-
>  elf/dl-support.c                     |   9 ++-
>  elf/dl-thread_gscope_wait.c          |   2 +
>  elf/rtld.c                           |  12 +++
>  nptl/allocatestack.c                 | 147 +++++++++--------------------------
>  nptl/descr.h                         |   3 +-
>  nptl/nptl-init.c                     |  11 +--
>  nptl/pthreadP.h                      |   4 -
>  nptl/pthread_create.c                |   8 +-
>  nptl_db/structs.def                  |   4 +-
>  nptl_db/td_init.c                    |  14 ++++
>  nptl_db/td_ta_map_lwp2thr.c          |   2 +-
>  nptl_db/td_ta_thr_iter.c             |   4 +-
>  nptl_db/td_thr_tlsbase.c             |   7 +-
>  nptl_db/td_thr_validate.c            |  39 +++++++++-
>  nptl_db/thread_dbP.h                 |  13 ++++
>  sysdeps/aarch64/nptl/tls.h           |   2 -
>  sysdeps/alpha/nptl/tls.h             |   2 -
>  sysdeps/arc/nptl/tls.h               |   2 -
>  sysdeps/arm/nptl/tls.h               |   2 -
>  sysdeps/csky/nptl/tls.h              |   2 -
>  sysdeps/generic/ldsodefs.h           |  19 ++++-
>  sysdeps/hppa/nptl/tls.h              |   2 -
>  sysdeps/i386/nptl/tls.h              |   2 -
>  sysdeps/ia64/nptl/tls.h              |   2 -
>  sysdeps/m68k/nptl/tls.h              |   2 -
>  sysdeps/microblaze/nptl/tls.h        |   2 -
>  sysdeps/mips/nptl/tls.h              |   2 -
>  sysdeps/nios2/nptl/tls.h             |   2 -
>  sysdeps/nptl/dl-thread_gscope_wait.c |  80 +++++++++++++++++++
>  sysdeps/powerpc/nptl/tls.h           |   2 -
>  sysdeps/riscv/nptl/tls.h             |   2 -
>  sysdeps/s390/nptl/tls.h              |   2 -
>  sysdeps/sh/nptl/tls.h                |   2 -
>  sysdeps/sparc/nptl/tls.h             |   2 -
>  sysdeps/x86_64/nptl/tls.h            |   2 -
>  37 files changed, 243 insertions(+), 184 deletions(-)
> 
> diff --git a/csu/libc-tls.c b/csu/libc-tls.c
> index 06e76bd395..c3589f0a7d 100644
> --- a/csu/libc-tls.c
> +++ b/csu/libc-tls.c
> @@ -24,6 +24,7 @@
>  #include <stdio.h>
>  #include <sys/param.h>
>  #include <array_length.h>
> +#include <list.h>
>  
>  #ifdef SHARED
>   #error makefile bug, this file is for static only
> @@ -193,6 +194,12 @@ __libc_setup_tls (void)
>    if (__builtin_expect (lossage != NULL, 0))
>      _startup_fatal (lossage);
>  
> +#if THREAD_GSCOPE_IN_TCB
> +  INIT_LIST_HEAD (&_dl_stack_used);
> +  INIT_LIST_HEAD (&_dl_stack_user);
> +  list_add (&THREAD_SELF->list, &_dl_stack_user);
> +#endif
> +
>    /* Update the executable's link map with enough information to make
>       the TLS routines happy.  */
>    main_map->l_tls_align = align;

Ok.

> diff --git a/elf/Makefile b/elf/Makefile
> index f10cc59e7c..2015383eb2 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -34,7 +34,8 @@ dl-routines	= $(addprefix dl-,load lookup object reloc deps \
>  				  version profile tls origin scope \
>  				  execstack open close trampoline \
>  				  exception sort-maps lookup-direct \
> -				  call-libc-early-init write)
> +				  call-libc-early-init write \
> +				  thread_gscope_wait)
>  ifeq (yes,$(use-ldconfig))
>  dl-routines += dl-cache
>  endif

Ok.

> @@ -54,7 +55,8 @@ endif
>  all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
>  # But they are absent from the shared libc, because that code is in ld.so.
>  elide-routines.os = $(all-dl-routines) dl-support enbl-secure dl-origin \
> -		    dl-sysdep dl-exception dl-reloc-static-pie
> +		    dl-sysdep dl-exception dl-reloc-static-pie \
> +		    thread_gscope_wait
>  
>  # ld.so uses those routines, plus some special stuff for being the program
>  # interpreter and operating independent of libc.

Ok.

> diff --git a/elf/dl-support.c b/elf/dl-support.c
> index afbc94df54..15e6d787c9 100644
> --- a/elf/dl-support.c
> +++ b/elf/dl-support.c
> @@ -183,10 +183,11 @@ ElfW(Word) _dl_stack_flags = DEFAULT_STACK_PERMS;
>  int (*_dl_make_stack_executable_hook) (void **) = _dl_make_stack_executable;
>  
>  
> -/* Function in libpthread to wait for termination of lookups.  */
> -void (*_dl_wait_lookup_done) (void);
> -
> -#if !THREAD_GSCOPE_IN_TCB
> +#if THREAD_GSCOPE_IN_TCB
> +list_t _dl_stack_used;
> +list_t _dl_stack_user;
> +int _dl_stack_cache_lock;
> +#else
>  int _dl_thread_gscope_count;
>  #endif
>  struct dl_scope_free_list *_dl_scope_free_list;

Ok.

> diff --git a/elf/dl-thread_gscope_wait.c b/elf/dl-thread_gscope_wait.c
> new file mode 100644
> index 0000000000..28e1494730
> --- /dev/null
> +++ b/elf/dl-thread_gscope_wait.c
> @@ -0,0 +1,2 @@
> +/* By default, the dynamic linker does not use an out-of-line
> +   __thread_gscope_wait function.  */

Ok.

> diff --git a/elf/rtld.c b/elf/rtld.c
> index 5d117d0d2c..c4ffc8d4b7 100644
> --- a/elf/rtld.c
> +++ b/elf/rtld.c
> @@ -48,6 +48,7 @@
>  #include <array_length.h>
>  #include <libc-early-init.h>
>  #include <dl-main.h>
> +#include <list.h>
>  
>  #include <assert.h>
>  
> @@ -799,6 +800,9 @@ cannot allocate TLS data structures for initial thread\n");
>    const char *lossage = TLS_INIT_TP (tcbp);
>    if (__glibc_unlikely (lossage != NULL))
>      _dl_fatal_printf ("cannot set up thread-local storage: %s\n", lossage);
> +#if THREAD_GSCOPE_IN_TCB
> +  list_add (&THREAD_SELF->list, &GL (dl_stack_user));
> +#endif
>    tls_init_tp_called = true;
>  
>    return tcbp;

Ok.

> @@ -1139,6 +1143,11 @@ dl_main (const ElfW(Phdr) *phdr,
>    GL(dl_rtld_unlock_recursive) = rtld_lock_default_unlock_recursive;
>  #endif
>  
> +#if THREAD_GSCOPE_IN_TCB
> +  INIT_LIST_HEAD (&GL (dl_stack_used));
> +  INIT_LIST_HEAD (&GL (dl_stack_user));
> +#endif
> +
>    /* The explicit initialization here is cheaper than processing the reloc
>       in the _rtld_local definition's initializer.  */
>    GL(dl_make_stack_executable_hook) = &_dl_make_stack_executable;

Ok.

> @@ -2383,6 +2392,9 @@ dl_main (const ElfW(Phdr) *phdr,
>        if (__glibc_unlikely (lossage != NULL))
>  	_dl_fatal_printf ("cannot set up thread-local storage: %s\n",
>  			  lossage);
> +#if THREAD_GSCOPE_IN_TCB
> +      list_add (&THREAD_SELF->list, &GL (dl_stack_user));
> +#endif
>      }
>  
>    /* Make sure no new search directories have been added.  */

Ok.

> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 4b45f8c884..b7f9eeebf6 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -106,26 +106,14 @@
>  static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40MiBi by default.  */
>  static size_t stack_cache_actsize;
>  
> -/* Mutex protecting this variable.  */
> -static int stack_cache_lock = LLL_LOCK_INITIALIZER;
> -
>  /* List of queued stack frames.  */
>  static LIST_HEAD (stack_cache);
>  
> -/* List of the stacks in use.  */
> -static LIST_HEAD (stack_used);
> -
>  /* We need to record what list operations we are going to do so that,
>     in case of an asynchronous interruption due to a fork() call, we
>     can correct for the work.  */
>  static uintptr_t in_flight_stack;
>  
> -/* List of the threads with user provided stacks in use.  No need to
> -   initialize this, since it's done in __pthread_initialize_minimal.  */
> -list_t __stack_user __attribute__ ((nocommon));
> -hidden_data_def (__stack_user)
> -
> -
>  /* Check whether the stack is still used or not.  */
>  #define FREE_P(descr) ((descr)->tid <= 0)
>  

Ok.

> @@ -173,7 +161,7 @@ get_cached_stack (size_t *sizep, void **memp)
>    struct pthread *result = NULL;
>    list_t *entry;
>  
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Search the cache for a matching entry.  We search for the
>       smallest stack which has at least the required size.  Note that

Ok.

> @@ -206,7 +194,7 @@ get_cached_stack (size_t *sizep, void **memp)
>        || __builtin_expect (result->stackblock_size > 4 * size, 0))
>      {
>        /* Release the lock.  */
> -      lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>        return NULL;
>      }

Ok.

> @@ -218,13 +206,13 @@ get_cached_stack (size_t *sizep, void **memp)
>    stack_list_del (&result->list);
>  
>    /* And add to the list of stacks in use.  */
> -  stack_list_add (&result->list, &stack_used);
> +  stack_list_add (&result->list, &GL (dl_stack_used));
>  
>    /* And decrease the cache size.  */
>    stack_cache_actsize -= result->stackblock_size;
>  
>    /* Release the lock early.  */
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Report size and location of the stack to the caller.  */
>    *sizep = result->stackblock_size;

Ok.

> @@ -510,12 +498,12 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  
>  
>        /* Prepare to modify global data.  */
> -      lll_lock (stack_cache_lock, LLL_PRIVATE);
> +      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>        /* And add to the list of stacks in use.  */
> -      list_add (&pd->list, &__stack_user);
> +      list_add (&pd->list, &GL (dl_stack_user));
>  
> -      lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>      }
>    else
>      {

Ok.

> @@ -644,12 +632,12 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  
>  
>  	  /* Prepare to modify global data.  */
> -	  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +	  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  	  /* And add to the list of stacks in use.  */
> -	  stack_list_add (&pd->list, &stack_used);
> +	  stack_list_add (&pd->list, &GL (dl_stack_used));
>  
> -	  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +	  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  
>  	  /* There might have been a race.  Another thread might have

Ok.

> @@ -690,12 +678,12 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  	  if (__mprotect (guard, guardsize, PROT_NONE) != 0)
>  	    {
>  	    mprot_error:
> -	      lll_lock (stack_cache_lock, LLL_PRIVATE);
> +	      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  	      /* Remove the thread from the list.  */
>  	      stack_list_del (&pd->list);
>  
> -	      lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +	      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>  	      /* Get rid of the TLS block we allocated.  */
>  	      _dl_deallocate_tls (TLS_TPADJ (pd), false);

Ok.

> @@ -799,7 +787,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  void
>  __deallocate_stack (struct pthread *pd)
>  {
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Remove the thread from the list of threads with user defined
>       stacks.  */
> @@ -815,7 +803,7 @@ __deallocate_stack (struct pthread *pd)
>      /* Free the memory associated with the ELF TLS.  */
>      _dl_deallocate_tls (TLS_TPADJ (pd), false);
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  }
>  
>  

Ok.

> @@ -831,10 +819,10 @@ __make_stacks_executable (void **stack_endp)
>    const size_t pagemask = ~(__getpagesize () - 1);
>  #endif
>  
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        err = change_stack_perm (list_entry (runp, struct pthread, list)
>  #ifdef NEED_SEPARATE_REGISTER_STACK

Ok.

> @@ -860,7 +848,7 @@ __make_stacks_executable (void **stack_endp)
>  	  break;
>        }
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    return err;
>  }
> @@ -891,8 +879,8 @@ __reclaim_stacks (void)
>  	     pointers at the head of the list are inconsistent.  */
>  	  list_t *l = NULL;
>  
> -	  if (stack_used.next->prev != &stack_used)
> -	    l = &stack_used;
> +	  if (GL (dl_stack_used).next->prev != &GL (dl_stack_used))
> +	    l = &GL (dl_stack_used);
>  	  else if (stack_cache.next->prev != &stack_cache)
>  	    l = &stack_cache;
>  

Ok.

> @@ -914,7 +902,7 @@ __reclaim_stacks (void)
>  
>    /* Mark all stacks except the still running one as free.  */
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        struct pthread *curp = list_entry (runp, struct pthread, list);
>        if (curp != self)

Ok.

> @@ -948,7 +936,7 @@ __reclaim_stacks (void)
>      }
>  
>    /* Add the stack of all running threads to the cache.  */
> -  list_splice (&stack_used, &stack_cache);
> +  list_splice (&GL (dl_stack_used), &stack_cache);
>  
>    /* Remove the entry for the current thread to from the cache list
>       and add it to the list of running threads.  Which of the two

Ok.

> @@ -956,13 +944,13 @@ __reclaim_stacks (void)
>    stack_list_del (&self->list);
>  
>    /* Re-initialize the lists for all the threads.  */
> -  INIT_LIST_HEAD (&stack_used);
> -  INIT_LIST_HEAD (&__stack_user);
> +  INIT_LIST_HEAD (&GL (dl_stack_used));
> +  INIT_LIST_HEAD (&GL (dl_stack_user));
>  
>    if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
> -    list_add (&self->list, &__stack_user);
> +    list_add (&self->list, &GL (dl_stack_user));
>    else
> -    list_add (&self->list, &stack_used);
> +    list_add (&self->list, &GL (dl_stack_used));
>  
>    /* There is one thread running.  */
>    __nptl_nthreads = 1;

Ok.

> @@ -970,7 +958,7 @@ __reclaim_stacks (void)
>    in_flight_stack = 0;
>  
>    /* Initialize locks.  */
> -  stack_cache_lock = LLL_LOCK_INITIALIZER;
> +  GL (dl_stack_cache_lock) = LLL_LOCK_INITIALIZER;
>    __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
>  }
>  

Ok.

> @@ -1083,7 +1071,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  {
>    int signalled;
>    int result;
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    __xidcmd = cmdp;
>    cmdp->cntr = 0;
> @@ -1093,7 +1081,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  
>    /* Iterate over the list with system-allocated threads first.  */
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1103,7 +1091,7 @@ __nptl_setxid (struct xid_command *cmdp)
>      }
>  
>    /* Now the list with threads using user-allocated stacks.  */
> -  list_for_each (runp, &__stack_user)
> +  list_for_each (runp, &GL (dl_stack_user))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1119,7 +1107,7 @@ __nptl_setxid (struct xid_command *cmdp)
>      {
>        signalled = 0;
>  
> -      list_for_each (runp, &stack_used)
> +      list_for_each (runp, &GL (dl_stack_used))
>  	{
>  	  struct pthread *t = list_entry (runp, struct pthread, list);
>  	  if (t == self)
> @@ -1128,7 +1116,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  	  signalled += setxid_signal_thread (cmdp, t);
>  	}
>  
> -      list_for_each (runp, &__stack_user)
> +      list_for_each (runp, &GL (dl_stack_user))
>  	{
>  	  struct pthread *t = list_entry (runp, struct pthread, list);
>  	  if (t == self)
> @@ -1149,7 +1137,7 @@ __nptl_setxid (struct xid_command *cmdp)
>  
>    /* Clean up flags, so that no thread blocks during exit waiting
>       for a signal which will never come.  */
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1158,7 +1146,7 @@ __nptl_setxid (struct xid_command *cmdp)
>        setxid_unmark_thread (cmdp, t);
>      }
>  
> -  list_for_each (runp, &__stack_user)
> +  list_for_each (runp, &GL (dl_stack_user))
>      {
>        struct pthread *t = list_entry (runp, struct pthread, list);
>        if (t == self)
> @@ -1180,7 +1168,7 @@ __nptl_setxid (struct xid_command *cmdp)
>      }
>    __nptl_setxid_error (cmdp, error);
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>    return result;
>  }
>  

Ok.

> @@ -1204,75 +1192,16 @@ void
>  attribute_hidden
>  __pthread_init_static_tls (struct link_map *map)
>  {
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    /* Iterate over the list with system-allocated threads first.  */
>    list_t *runp;
> -  list_for_each (runp, &stack_used)
> +  list_for_each (runp, &GL (dl_stack_used))
>      init_one_static_tls (list_entry (runp, struct pthread, list), map);
>  
>    /* Now the list with threads using user-allocated stacks.  */
> -  list_for_each (runp, &__stack_user)
> +  list_for_each (runp, &GL (dl_stack_user))
>      init_one_static_tls (list_entry (runp, struct pthread, list), map);
>  

Ok.

> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> -}
> -
> -
> -void
> -attribute_hidden
> -__wait_lookup_done (void)
> -{
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> -
> -  struct pthread *self = THREAD_SELF;
> -
> -  /* Iterate over the list with system-allocated threads first.  */
> -  list_t *runp;
> -  list_for_each (runp, &stack_used)
> -    {
> -      struct pthread *t = list_entry (runp, struct pthread, list);
> -      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> -	continue;
> -
> -      int *const gscope_flagp = &t->header.gscope_flag;
> -
> -      /* We have to wait until this thread is done with the global
> -	 scope.  First tell the thread that we are waiting and
> -	 possibly have to be woken.  */
> -      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> -						THREAD_GSCOPE_FLAG_WAIT,
> -						THREAD_GSCOPE_FLAG_USED))
> -	continue;
> -
> -      do
> -	futex_wait_simple ((unsigned int *) gscope_flagp,
> -			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> -      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> -    }
> -
> -  /* Now the list with threads using user-allocated stacks.  */
> -  list_for_each (runp, &__stack_user)
> -    {
> -      struct pthread *t = list_entry (runp, struct pthread, list);
> -      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> -	continue;
> -
> -      int *const gscope_flagp = &t->header.gscope_flag;
> -
> -      /* We have to wait until this thread is done with the global
> -	 scope.  First tell the thread that we are waiting and
> -	 possibly have to be woken.  */
> -      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> -						THREAD_GSCOPE_FLAG_WAIT,
> -						THREAD_GSCOPE_FLAG_USED))
> -	continue;
> -
> -      do
> -	futex_wait_simple ((unsigned int *) gscope_flagp,
> -			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> -      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> -    }
> -
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  }

Ok.

> diff --git a/nptl/descr.h b/nptl/descr.h
> index d8343ff9a1..b172ee408b 100644
> --- a/nptl/descr.h
> +++ b/nptl/descr.h
> @@ -162,7 +162,8 @@ struct pthread
>      void *__padding[24];
>    };
>  
> -  /* This descriptor's link on the `stack_used' or `__stack_user' list.  */
> +  /* This descriptor's link on the GL (dl_stack_used) or
> +     GL (dl_stack_user) list.  */
>    list_t list;
>  
>    /* Thread ID - which is also a 'is this thread descriptor (and

Ok.

> diff --git a/nptl/nptl-init.c b/nptl/nptl-init.c
> index 4aa1231bec..53b817715d 100644
> --- a/nptl/nptl-init.c
> +++ b/nptl/nptl-init.c
> @@ -251,12 +251,9 @@ __pthread_initialize_minimal_internal (void)
>       purposes this is good enough.  */
>    THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end);
>  
> -  /* Initialize the list of all running threads with the main thread.  */
> -  INIT_LIST_HEAD (&__stack_user);
> -  list_add (&pd->list, &__stack_user);
> -
> -  /* Before initializing __stack_user, the debugger could not find us and
> -     had to set __nptl_initial_report_events.  Propagate its setting.  */
> +  /* Before initializing GL (dl_stack_user), the debugger could not
> +     find us and had to set __nptl_initial_report_events.  Propagate
> +     its setting.  */
>    THREAD_SETMEM (pd, report_events, __nptl_initial_report_events);
>  
>    struct sigaction sa;

Ok.

> @@ -336,8 +333,6 @@ __pthread_initialize_minimal_internal (void)
>  
>    GL(dl_init_static_tls) = &__pthread_init_static_tls;
>  
> -  GL(dl_wait_lookup_done) = &__wait_lookup_done;
> -
>    /* Register the fork generation counter with the libc.  */

Ok.

>  #ifndef TLS_MULTIPLE_THREADS_IN_TCB
>    __libc_multiple_threads_ptr =
> diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
> index 686f54aa93..a7510f9f63 100644
> --- a/nptl/pthreadP.h
> +++ b/nptl/pthreadP.h
> @@ -208,10 +208,6 @@ extern void __default_pthread_attr_freeres (void) attribute_hidden;
>  extern size_t __static_tls_size attribute_hidden;
>  extern size_t __static_tls_align_m1 attribute_hidden;
>  
> -/* Thread descriptor handling.  */
> -extern list_t __stack_user;
> -hidden_proto (__stack_user)
> -
>  /* Attribute handling.  */
>  extern struct pthread_attr *__attr_list attribute_hidden;
>  extern int __attr_list_lock attribute_hidden;

Ok.

> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 447f005ece..bad4e57a84 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -213,9 +213,9 @@ __find_in_stack_list (struct pthread *pd)
>    list_t *entry;
>    struct pthread *result = NULL;
>  
> -  lll_lock (stack_cache_lock, LLL_PRIVATE);
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
> -  list_for_each (entry, &stack_used)
> +  list_for_each (entry, &GL (dl_stack_used))
>      {
>        struct pthread *curp;
>  
> @@ -228,7 +228,7 @@ __find_in_stack_list (struct pthread *pd)
>      }
>  
>    if (result == NULL)
> -    list_for_each (entry, &__stack_user)
> +    list_for_each (entry, &GL (dl_stack_user))
>        {
>  	struct pthread *curp;
>  
> @@ -240,7 +240,7 @@ __find_in_stack_list (struct pthread *pd)
>  	  }
>        }
>  
> -  lll_unlock (stack_cache_lock, LLL_PRIVATE);
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
>  
>    return result;
>  }

Ok.

> diff --git a/nptl_db/structs.def b/nptl_db/structs.def
> index 472e9008fa..2ca33598b1 100644
> --- a/nptl_db/structs.def
> +++ b/nptl_db/structs.def
> @@ -69,8 +69,6 @@ DB_STRUCT (td_eventbuf_t)
>  DB_STRUCT_FIELD (td_eventbuf_t, eventnum)
>  DB_STRUCT_FIELD (td_eventbuf_t, eventdata)
>  
> -DB_SYMBOL (stack_used)
> -DB_SYMBOL (__stack_user)
>  DB_SYMBOL (nptl_version)
>  DB_FUNCTION (__nptl_create_event)
>  DB_FUNCTION (__nptl_death_event)
> @@ -106,6 +104,8 @@ DB_STRUCT (rtld_global)
>  DB_RTLD_VARIABLE (_rtld_global)
>  #endif
>  DB_RTLD_GLOBAL_FIELD (dl_tls_dtv_slotinfo_list)
> +DB_RTLD_GLOBAL_FIELD (dl_stack_user)
> +DB_RTLD_GLOBAL_FIELD (dl_stack_used)
>  
>  DB_STRUCT (dtv_slotinfo_list)
>  DB_STRUCT_FIELD (dtv_slotinfo_list, len)

Ok.

> diff --git a/nptl_db/td_init.c b/nptl_db/td_init.c
> index 86773dcb06..79a1f90312 100644
> --- a/nptl_db/td_init.c
> +++ b/nptl_db/td_init.c
> @@ -29,3 +29,17 @@ td_init (void)
>    LOG ("td_init");
>    return TD_OK;
>  }
> +
> +bool
> +__td_ta_rtld_global (td_thragent_t *ta)
> +{
> +  if (ta->ta_addr__rtld_global == 0
> +      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
> +                        &ta->ta_addr__rtld_global) != PS_OK)
> +    {
> +      ta->ta_addr__rtld_global = (void*)-1;
> +      return false;
> +    }
> +  else
> +    return ta->ta_addr__rtld_global != (void*)-1;
> +}

Ok.

> diff --git a/nptl_db/td_ta_map_lwp2thr.c b/nptl_db/td_ta_map_lwp2thr.c
> index d04d4bec23..81ff48808c 100644
> --- a/nptl_db/td_ta_map_lwp2thr.c
> +++ b/nptl_db/td_ta_map_lwp2thr.c
> @@ -187,7 +187,7 @@ td_ta_map_lwp2thr (const td_thragent_t *ta_arg,
>       fake a special descriptor for the initial thread.  */
>  
>    psaddr_t list;
> -  td_err_e err = DB_GET_SYMBOL (list, ta, __stack_user);
> +  td_err_e err = __td_ta_stack_user (ta, &list);
>    if (err != TD_OK)
>      return err;
>  

Ok.

> diff --git a/nptl_db/td_ta_thr_iter.c b/nptl_db/td_ta_thr_iter.c
> index d59782bd6e..e406bcbd39 100644
> --- a/nptl_db/td_ta_thr_iter.c
> +++ b/nptl_db/td_ta_thr_iter.c
> @@ -133,14 +133,14 @@ td_ta_thr_iter (const td_thragent_t *ta_arg, td_thr_iter_f *callback,
>       have to iterate over both lists separately.  We start with the
>       list of threads with user-defined stacks.  */
>  
> -  err = DB_GET_SYMBOL (list, ta, __stack_user);
> +  err = __td_ta_stack_user (ta, &list);
>    if (err == TD_OK)
>      err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
>  			       list, true);
>  
>    /* And the threads with stacks allocated by the implementation.  */
>    if (err == TD_OK)
> -    err = DB_GET_SYMBOL (list, ta, stack_used);
> +    err = __td_ta_stack_used (ta, &list);
>    if (err == TD_OK)
>      err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
>  			       list, false);

Ok.

> diff --git a/nptl_db/td_thr_tlsbase.c b/nptl_db/td_thr_tlsbase.c
> index 8198934112..eeba0a04d8 100644
> --- a/nptl_db/td_thr_tlsbase.c
> +++ b/nptl_db/td_thr_tlsbase.c
> @@ -28,12 +28,7 @@ dtv_slotinfo_list (td_thragent_t *ta,
>    td_err_e err;
>    psaddr_t head;
>  
> -  if (ta->ta_addr__rtld_global == 0
> -      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
> -			&ta->ta_addr__rtld_global) != PS_OK)
> -    ta->ta_addr__rtld_global = (void*)-1;
> -
> -  if (ta->ta_addr__rtld_global != (void*)-1)
> +  if (__td_ta_rtld_global (ta))
>      {
>        err = DB_GET_FIELD (head, ta, ta->ta_addr__rtld_global,
>  			  rtld_global, _dl_tls_dtv_slotinfo_list, 0);
> diff --git a/nptl_db/td_thr_validate.c b/nptl_db/td_thr_validate.c

Ok.

> index c709c5e9be..d2f4107cf2 100644
> --- a/nptl_db/td_thr_validate.c
> +++ b/nptl_db/td_thr_validate.c
> @@ -20,6 +20,41 @@
>  #include "thread_dbP.h"
>  #include <stdbool.h>
>  
> +td_err_e
> +__td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
> +{
> +  if (__td_ta_rtld_global (ta))
> +    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
> +				 rtld_global, _dl_stack_user, 0);
> +  else
> +    {
> +      if (ta->ta_addr__dl_stack_user == 0
> +	  && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_user,
> +			    &ta->ta_addr__dl_stack_user) != PS_OK)
> +	return TD_ERR;
> +      *plist = ta->ta_addr__dl_stack_user;
> +      return TD_OK;
> +    }
> +}
> +
> +td_err_e
> +__td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
> +{
> +
> +  if (__td_ta_rtld_global (ta))
> +    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
> +				 rtld_global, _dl_stack_used, 0);
> +  else
> +    {
> +      if (ta->ta_addr__dl_stack_used == 0
> +	  && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_used,
> +			    &ta->ta_addr__dl_stack_used) != PS_OK)
> +	return TD_ERR;
> +      *plist = ta->ta_addr__dl_stack_used;
> +      return TD_OK;
> +    }
> +}
> +
>  static td_err_e
>  check_thread_list (const td_thrhandle_t *th, psaddr_t head, bool *uninit)
>  {

Ok.

> @@ -62,7 +97,7 @@ td_thr_validate (const td_thrhandle_t *th)
>  
>    /* First check the list with threads using user allocated stacks.  */
>    bool uninit = false;
> -  err = DB_GET_SYMBOL (list, th->th_ta_p, __stack_user);
> +  err = __td_ta_stack_user (th->th_ta_p, &list);
>    if (err == TD_OK)
>      err = check_thread_list (th, list, &uninit);
>  
> @@ -70,7 +105,7 @@ td_thr_validate (const td_thrhandle_t *th)
>       using implementation allocated stacks.  */
>    if (err == TD_NOTHR)
>      {
> -      err = DB_GET_SYMBOL (list, th->th_ta_p, stack_used);
> +      err = __td_ta_stack_used (th->th_ta_p, &list);
>        if (err == TD_OK)
>  	err = check_thread_list (th, list, &uninit);
>  

Ok.

> diff --git a/nptl_db/thread_dbP.h b/nptl_db/thread_dbP.h
> index 8a666b8d4f..9574e68d70 100644
> --- a/nptl_db/thread_dbP.h
> +++ b/nptl_db/thread_dbP.h
> @@ -269,4 +269,17 @@ extern td_err_e _td_check_sizeof (td_thragent_t *ta, uint32_t *sizep,
>  extern td_err_e __td_ta_lookup_th_unique (const td_thragent_t *ta,
>  					  lwpid_t lwpid, td_thrhandle_t *th);
>  
> +/* Try to initialize TA->ta_addr__rtld_global.  Return true on
> +   success, false on failure (which may be cached).  */
> +bool __td_ta_rtld_global (td_thragent_t *ta) attribute_hidden;
> +
> +/* Obtain the address of the list_t fields _dl_stack_user and
> +   _dl_stack_used in _rtld_global, or fall back to the global
> +   variables of the same name (to support statically linked
> +   programs).  */
> +td_err_e __td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
> +  attribute_hidden;
> +td_err_e __td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
> +  attribute_hidden;
> +
>  #endif /* thread_dbP.h */

Ok.

> diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h
> index e5627f777e..dbef4704f1 100644
> --- a/sysdeps/aarch64/nptl/tls.h
> +++ b/sysdeps/aarch64/nptl/tls.h
> @@ -129,8 +129,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  # endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/alpha/nptl/tls.h b/sysdeps/alpha/nptl/tls.h
> index 82549607fd..f3101f57c9 100644
> --- a/sysdeps/alpha/nptl/tls.h
> +++ b/sysdeps/alpha/nptl/tls.h
> @@ -123,8 +123,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #else /* __ASSEMBLER__ */
>  # include <tcb-offsets.h>

Ok.

> diff --git a/sysdeps/arc/nptl/tls.h b/sysdeps/arc/nptl/tls.h
> index 184b550ab5..cc5abb1931 100644
> --- a/sysdeps/arc/nptl/tls.h
> +++ b/sysdeps/arc/nptl/tls.h
> @@ -131,8 +131,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* !__ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/arm/nptl/tls.h b/sysdeps/arm/nptl/tls.h
> index 24a488ff37..91dd7498e4 100644
> --- a/sysdeps/arm/nptl/tls.h
> +++ b/sysdeps/arm/nptl/tls.h
> @@ -120,8 +120,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  


Ok.

> diff --git a/sysdeps/csky/nptl/tls.h b/sysdeps/csky/nptl/tls.h
> index bcca9674a1..f308773d40 100644
> --- a/sysdeps/csky/nptl/tls.h
> +++ b/sysdeps/csky/nptl/tls.h
> @@ -147,8 +147,6 @@ typedef struct
>        atomic_write_barrier ();						      \
>      }									      \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  


Ok.

> diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
> index 382eeb9be0..b1da03cafe 100644
> --- a/sysdeps/generic/ldsodefs.h
> +++ b/sysdeps/generic/ldsodefs.h
> @@ -38,6 +38,7 @@
>  #include <libc-lock.h>
>  #include <hp-timing.h>
>  #include <tls.h>
> +#include <list_t.h>
>  
>  __BEGIN_DECLS
>  
> @@ -461,15 +462,22 @@ struct rtld_global
>  
>    EXTERN void (*_dl_init_static_tls) (struct link_map *);
>  
> -  EXTERN void (*_dl_wait_lookup_done) (void);
> -
>    /* Scopes to free after next THREAD_GSCOPE_WAIT ().  */
>    EXTERN struct dl_scope_free_list
>    {
>      size_t count;
>      void *list[50];
>    } *_dl_scope_free_list;
> -#if !THREAD_GSCOPE_IN_TCB
> +#if THREAD_GSCOPE_IN_TCB
> +  /* List of active thread stacks, with memory managed by glibc.  */
> +  EXTERN list_t _dl_stack_used;
> +
> +  /* List of thread stacks that were allocated by the application.  */
> +  EXTERN list_t _dl_stack_user;
> +
> +  /* Mutex protecting the stack lists.  */
> +  EXTERN int _dl_stack_cache_lock;
> +#else
>    EXTERN int _dl_thread_gscope_count;
>  #endif
>  #ifdef SHARED
> @@ -1252,6 +1260,11 @@ link_map_audit_state (struct link_map *l, size_t index)
>  }
>  #endif /* SHARED */
>  
> +#if THREAD_GSCOPE_IN_TCB
> +void __thread_gscope_wait (void) attribute_hidden;
> +# define THREAD_GSCOPE_WAIT() __thread_gscope_wait ()
> +#endif
> +
>  __END_DECLS
>  
>  #endif /* ldsodefs.h */

Ok.

> diff --git a/sysdeps/hppa/nptl/tls.h b/sysdeps/hppa/nptl/tls.h
> index 2315340735..f319cb42e2 100644
> --- a/sysdeps/hppa/nptl/tls.h
> +++ b/sysdeps/hppa/nptl/tls.h
> @@ -154,8 +154,6 @@ static inline void __set_cr27(struct pthread *cr27)
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* !__ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
> index 5042d52b98..b74347bacd 100644
> --- a/sysdeps/i386/nptl/tls.h
> +++ b/sysdeps/i386/nptl/tls.h
> @@ -387,8 +387,6 @@ tls_fill_user_desc (union user_desc_init *desc,
>    while (0)
>  #define THREAD_GSCOPE_SET_FLAG() \
>    THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
> index f5b1684ef3..42b082dad6 100644
> --- a/sysdeps/ia64/nptl/tls.h
> +++ b/sysdeps/ia64/nptl/tls.h
> @@ -175,8 +175,6 @@ register struct pthread *__thread_self __asm__("r13");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/m68k/nptl/tls.h b/sysdeps/m68k/nptl/tls.h
> index 68ea952e79..69e174484e 100644
> --- a/sysdeps/m68k/nptl/tls.h
> +++ b/sysdeps/m68k/nptl/tls.h
> @@ -153,8 +153,6 @@ extern void * __m68k_read_tp (void);
>        atomic_write_barrier ();						\
>      }									\
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/microblaze/nptl/tls.h b/sysdeps/microblaze/nptl/tls.h
> index a094fdb798..78f63019dc 100644
> --- a/sysdeps/microblaze/nptl/tls.h
> +++ b/sysdeps/microblaze/nptl/tls.h
> @@ -136,8 +136,6 @@ typedef struct
>        atomic_write_barrier ();                                              \
>      }                                                                       \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL (dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/mips/nptl/tls.h b/sysdeps/mips/nptl/tls.h
> index 8b55f19c37..7e7ac43d6c 100644
> --- a/sysdeps/mips/nptl/tls.h
> +++ b/sysdeps/mips/nptl/tls.h
> @@ -178,8 +178,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/nios2/nptl/tls.h b/sysdeps/nios2/nptl/tls.h
> index facb27c761..776fe9bb2d 100644
> --- a/sysdeps/nios2/nptl/tls.h
> +++ b/sysdeps/nios2/nptl/tls.h
> @@ -157,8 +157,6 @@ register struct pthread *__thread_self __asm__("r23");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/nptl/dl-thread_gscope_wait.c b/sysdeps/nptl/dl-thread_gscope_wait.c
> new file mode 100644
> index 0000000000..a9bfbee24c
> --- /dev/null
> +++ b/sysdeps/nptl/dl-thread_gscope_wait.c
> @@ -0,0 +1,80 @@
> +/* Out-of-line notification function for the GSCOPE locking mechanism.
> +   Copyright (C) 2007-2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <nptl/descr.h>
> +#include <futex-internal.h>
> +#include <ldsodefs.h>
> +#include <list.h>
> +#include <lowlevellock.h>
> +
> +void
> +__thread_gscope_wait (void)
> +{
> +  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
> +
> +  struct pthread *self = THREAD_SELF;
> +
> +  /* Iterate over the list with system-allocated threads first.  */
> +  list_t *runp;
> +  list_for_each (runp, &GL (dl_stack_used))
> +    {
> +      struct pthread *t = list_entry (runp, struct pthread, list);
> +      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> +        continue;
> +
> +      int *const gscope_flagp = &t->header.gscope_flag;
> +
> +      /* We have to wait until this thread is done with the global
> +         scope.  First tell the thread that we are waiting and
> +         possibly have to be woken.  */
> +      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> +                                                THREAD_GSCOPE_FLAG_WAIT,
> +                                                THREAD_GSCOPE_FLAG_USED))
> +        continue;
> +
> +      do
> +        futex_wait_simple ((unsigned int *) gscope_flagp,
> +                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> +      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> +    }
> +
> +  /* Now the list with threads using user-allocated stacks.  */
> +  list_for_each (runp, &GL (dl_stack_user))
> +    {
> +      struct pthread *t = list_entry (runp, struct pthread, list);
> +      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
> +        continue;
> +
> +      int *const gscope_flagp = &t->header.gscope_flag;
> +
> +      /* We have to wait until this thread is done with the global
> +         scope.  First tell the thread that we are waiting and
> +         possibly have to be woken.  */
> +      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
> +                                                THREAD_GSCOPE_FLAG_WAIT,
> +                                                THREAD_GSCOPE_FLAG_USED))
> +        continue;
> +
> +      do
> +        futex_wait_simple ((unsigned int *) gscope_flagp,
> +                           THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
> +      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
> +    }
> +
> +  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
> +}

Ok, this is basically __wait_lookup_done.

> diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
> index b1f0b30fbb..261eecfd18 100644
> --- a/sysdeps/powerpc/nptl/tls.h
> +++ b/sysdeps/powerpc/nptl/tls.h
> @@ -245,8 +245,6 @@ register void *__thread_register __asm__ ("r13");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/riscv/nptl/tls.h b/sysdeps/riscv/nptl/tls.h
> index a9167bc143..41d9db10cf 100644
> --- a/sysdeps/riscv/nptl/tls.h
> +++ b/sysdeps/riscv/nptl/tls.h
> @@ -139,8 +139,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
> index 4a9b3570eb..7653109617 100644
> --- a/sysdeps/s390/nptl/tls.h
> +++ b/sysdeps/s390/nptl/tls.h
> @@ -185,8 +185,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
> index 6ccad251f9..2d5e3731b2 100644
> --- a/sysdeps/sh/nptl/tls.h
> +++ b/sysdeps/sh/nptl/tls.h
> @@ -161,8 +161,6 @@ typedef struct
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  

Ok.

> diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
> index 18a548e86b..b08a156131 100644
> --- a/sysdeps/sparc/nptl/tls.h
> +++ b/sysdeps/sparc/nptl/tls.h
> @@ -158,8 +158,6 @@ register struct pthread *__thread_self __asm__("%g7");
>        atomic_write_barrier ();						     \
>      }									     \
>    while (0)
> -#define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* !ASSEMBLER */
>  

Ok.

> diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
> index fbd7f9cb89..a08bf972de 100644
> --- a/sysdeps/x86_64/nptl/tls.h
> +++ b/sysdeps/x86_64/nptl/tls.h
> @@ -332,8 +332,6 @@ _Static_assert (offsetof (tcbhead_t, __glibc_unused2) == 0x80,
>    while (0)
>  # define THREAD_GSCOPE_SET_FLAG() \
>    THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
> -# define THREAD_GSCOPE_WAIT() \
> -  GL(dl_wait_lookup_done) ()
>  
>  #endif /* __ASSEMBLER__ */
>  
> 

Ok.
Simon Marchi March 5, 2021, 4:54 p.m. UTC | #2
On 2020-11-13 10:10 a.m., Florian Weimer wrote:
> Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
> formerly __wait_lookup_done) can be implemented directly in ld.so,
> eliminating the unprotected GL (dl_wait_lookup_done) function
> pointer.

Hi Florian,

Presumably starting with this commit (I don't really know how to build a
glibc and test against it), GDB fails to attach to a threaded process
because libthread_db fails to initialize.  See:

  https://sourceware.org/bugzilla/show_bug.cgi?id=27526

The difference in behavior as seen from GDB is that libthread_db now
asks to look up a symbol "_dl_stack_user" in module NULL.  GDB can't
find this symbol, which fails the initialization.

Can you shed some light on this?  Is this request expected, and where is
GDB expected to find this symbol?

Thanks,

Simon
Florian Weimer March 5, 2021, 5:15 p.m. UTC | #3
* Simon Marchi via Libc-alpha:

> On 2020-11-13 10:10 a.m., Florian Weimer wrote:
>> Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
>> formerly __wait_lookup_done) can be implemented directly in ld.so,
>> eliminating the unprotected GL (dl_wait_lookup_done) function
>> pointer.
>
> Hi Florian,
>
> Presumably starting with this commit (I don't really know how to build a
> glibc and test against it), GDB fails to attach to a threaded process
> because libthread_db fails to initialize.  See:
>
>   https://sourceware.org/bugzilla/show_bug.cgi?id=27526
>
> The difference in behavior as seen from GDB is that libthread_db now
> asks to look up a symbol "_dl_stack_user" in module NULL.  GDB can't
> find this symbol, which fails the initialization.
>
> Can you shed some light on this?  Is this request expected, and where is
> GDB expected to find this symbol?

It is not expected.  This is the fallback path if _rtld_global cannot be
located.  The actual failure is that __td_ta_rtld_global does not
succeed.  Any suggestion how to debug this further?

Sorry, I had thought I tested this.  I guess we should add some sort of
regression testing for nptl_db, but I don't know how to tell GDB to use
the just-built nptl_db.

Thanks,
Florian
Andreas Schwab March 5, 2021, 5:26 p.m. UTC | #4
On Mär 05 2021, Florian Weimer via Libc-alpha wrote:

> Sorry, I had thought I tested this.  I guess we should add some sort of
> regression testing for nptl_db, but I don't know how to tell GDB to use
> the just-built nptl_db.

(gdb) help set libthread-db-search-path

Andreas.
Simon Marchi March 5, 2021, 5:58 p.m. UTC | #5
On 2021-03-05 12:15 p.m., Florian Weimer wrote:
> * Simon Marchi via Libc-alpha:
> 
>> On 2020-11-13 10:10 a.m., Florian Weimer wrote:
>>> Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
>>> formerly __wait_lookup_done) can be implemented directly in ld.so,
>>> eliminating the unprotected GL (dl_wait_lookup_done) function
>>> pointer.
>>
>> Hi Florian,
>>
>> Presumably starting with this commit (I don't really know how to build a
>> glibc and test against it), GDB fails to attach to a threaded process
>> because libthread_db fails to initialize.  See:
>>
>>   https://sourceware.org/bugzilla/show_bug.cgi?id=27526
>>
>> The difference in behavior as seen from GDB is that libthread_db now
>> asks to look up a symbol "_dl_stack_user" in module NULL.  GDB can't
>> find this symbol, which fails the initialization.
>>
>> Can you shed some light on this?  Is this request expected, and where is
>> GDB expected to find this symbol?
> 
> It is not expected.  This is the fallback path if _rtld_global cannot be
> located.  The actual failure is that __td_ta_rtld_global does not
> succeed.

[adding gdb-patches]

Ok, thanks for that tip.  Indeed I see that GDB returns PS_NOSYM for
_rtld_global.  If I now log what GDB returns:

    LOOKUP nptl_version in libpthread.so.0
    Found 0x7fdb02713037
    LOOKUP _rtld_global in ld-linux-x86-64.so.2
    Not found
    LOOKUP _dl_stack_user in (null)
    Not found

So this lookup of _rtld_global is new too.  And I think I see the
problem, it looks like an ordering issue: libthread_db is loaded when
GDB notices the program has libpthread loaded in it.  When attaching,
GDB walks the shared library list.  In that list, libpthread comes
before ld-linux.  So at the time we try to load libthread_db, GDB hasn't
yet noticed that the program has ld-linux loaded in it, hasn't ingested
its symbols, so doesn't find _rtld_global.

For comparison, glibc 2.31 (on Ubuntu 20.04) only requested symbols in
libpthread itself, so there wasn't this ordering issue:

    LOOKUP nptl_version in libpthread.so.0
    Found 0x7f38538e9037
    LOOKUP __stack_user in libpthread.so.0
    Found 0x7f38538f3350
    LOOKUP _thread_db_list_t_next in libpthread.so.0
    Found 0x7f38538e93b0
    LOOKUP _thread_db_const_thread_area in libpthread.so.0
    Found 0x7f38538e92b4
    LOOKUP _thread_db_sizeof_pthread in libpthread.so.0
    Found 0x7f38538e92cc
    LOOKUP _thread_db_pthread_specific in libpthread.so.0
    Found 0x7f38538e9400
    LOOKUP _thread_db_pthread_schedpolicy in libpthread.so.0
    Found 0x7f38538e9420
    LOOKUP _thread_db_pthread_schedparam_sched_priority in libpthread.so.0
    Found 0x7f38538e9410
    LOOKUP _thread_db_pthread_tid in libpthread.so.0
    Found 0x7f38538e9450
    LOOKUP _thread_db_pthread_cancelhandling in libpthread.so.0
    Found 0x7f38538e9430
    LOOKUP _thread_db_pthread_report_events in libpthread.so.0
    Found 0x7f38538e9460
    LOOKUP _thread_db_pthread_start_routine in libpthread.so.0
    Found 0x7f38538e9440
    LOOKUP _thread_db_pthread_eventbuf_eventmask_event_bits in libpthread.so.0
    Found 0x7f38538e93d0

If we have to deal with this, I guess that GDB should now do things in a
different order: go through the whole library list and load their
symbols.  And then if one of those libraries were libpthread, try to
initialize libthread_db.

Simon
Florian Weimer March 5, 2021, 6:03 p.m. UTC | #6
* Simon Marchi:

> On 2021-03-05 12:15 p.m., Florian Weimer wrote:
>> * Simon Marchi via Libc-alpha:
>> 
>>> On 2020-11-13 10:10 a.m., Florian Weimer wrote:
>>>> Now __thread_gscope_wait (the function behind THREAD_GSCOPE_WAIT,
>>>> formerly __wait_lookup_done) can be implemented directly in ld.so,
>>>> eliminating the unprotected GL (dl_wait_lookup_done) function
>>>> pointer.
>>>
>>> Hi Florian,
>>>
>>> Presumably starting with this commit (I don't really know how to build a
>>> glibc and test against it), GDB fails to attach to a threaded process
>>> because libthread_db fails to initialize.  See:
>>>
>>>   https://sourceware.org/bugzilla/show_bug.cgi?id=27526
>>>
>>> The difference in behavior as seen from GDB is that libthread_db now
>>> asks to look up a symbol "_dl_stack_user" in module NULL.  GDB can't
>>> find this symbol, which fails the initialization.
>>>
>>> Can you shed some light on this?  Is this request expected, and where is
>>> GDB expected to find this symbol?
>> 
>> It is not expected.  This is the fallback path if _rtld_global cannot be
>> located.  The actual failure is that __td_ta_rtld_global does not
>> succeed.
>
> [adding gdb-patches]
>
> Ok, thanks for that tip.  Indeed I see that GDB returns PS_NOSYM for
> _rtld_global.  If I now log what GDB returns:
>
>     LOOKUP nptl_version in libpthread.so.0
>     Found 0x7fdb02713037
>     LOOKUP _rtld_global in ld-linux-x86-64.so.2
>     Not found
>     LOOKUP _dl_stack_user in (null)
>     Not found
>
> So this lookup of _rtld_global is new too.  And I think I see the
> problem, it looks like an ordering issue: libthread_db is loaded when
> GDB notices the program has libpthread loaded in it.  When attaching,
> GDB walks the shared library list.  In that list, libpthread comes
> before ld-linux.  So at the time we try to load libthread_db, GDB hasn't
> yet noticed that the program has ld-linux loaded in it, hasn't ingested
> its symbols, so doesn't find _rtld_global.

Oh, that reads like a plausible explanation.  And I assume the
non-attaching case, where GDB starts the process, is very different, and
this ordering issue does not appear?

> If we have to deal with this, I guess that GDB should now do things in a
> different order: go through the whole library list and load their
> symbols.  And then if one of those libraries were libpthread, try to
> initialize libthread_db.

Initialization of libthread_db should be unconditional.  Programs use
TLS data without linking against libpthread.  And glibc 2.34 might not
have a separate libpthread at all.

Thanks,
Florian
Simon Marchi March 5, 2021, 6:45 p.m. UTC | #7
On 2021-03-05 1:03 p.m., Florian Weimer wrote:
> Oh, that reads like a plausible explanation.  And I assume the
> non-attaching case, where GDB starts the process, is very different, and
> this ordering issue does not appear?

Indeed, in that case ld-linux-x86-64.so.2 is loaded before
libpthread.so.0 (ld-linux is necessarily loaded before the others, I
guess, since it's the one loading the others).  So the symbol is found:

    LOOKUP _rtld_global in ld-linux-x86-64.so.2
    Found 0x7ffff7ffd000

>> If we have to deal with this, I guess that GDB should now do things in a
>> different order: go through the whole library list and load their
>> symbols.  And then if one of those libraries were libpthread, try to
>> initialize libthread_db.
> 
> Initialization of libthread_db should be unconditional.  Programs use
> TLS data without linking against libpthread.  And glibc 2.34 might not
> have a separate libpthread at all.

Ok, currently GDB attempts to load libthread_db when noticing the main
objfile / program (I guess it is needed if the program is statically
linked to libpthread?) or when seeing a library named libpthread*.

I'm not sure how to fix this, other than making GDB attempt to load
libthread_db on every new shared library it notices, since that new
shared library may "finally" make it work.  The current code
specifically exists to avoid trying to load libthread_db for every new
shared library we notice, since that was considered wasteful.  Here's
the original thread about it:

  https://sourceware.org/pipermail/gdb-patches/2011-October/085781.html
  https://pi.simark.ca/gdb-patches/20111005182705.D744E2461D1@ruffy.mtv.corp.google.com/

About the hypothetical scenario for glibc 2.34: do you mean that the
pthread infrastructure will directly be in libc.so?  If so, our current
strategy of attempting to load libthread_db only for the main program
or a libpthread* library will indeed not work.  And I suppose that will
also require trying to load libthread_db on every new shared lib...

Simon
Florian Weimer March 5, 2021, 7 p.m. UTC | #8
* Simon Marchi:

>>> If we have to deal with this, I guess that GDB should now do things in a
>>> different order: go through the whole library list and load their
>>> symbols.  And then if one of those libraries were libpthread, try to
>>> initialize libthread_db.
>> 
>> Initialization of libthread_db should be unconditional.  Programs use
>> TLS data without linking against libpthread.  And glibc 2.34 might not
>> have a separate libpthread at all.
>
> Ok, currently GDB attempts to load libthread_db when noticing the main
> objfile / program (I guess it is needed if the program is statically
> linked to libpthread?) or when seeing a library named libpthread*.

Would it be possible to load libthread_db unconditionally after loading
all shared objects?  Then it is loaded only once.

> About the hypothetical scenario for glibc 2.34: do you mean that the
> pthread infrastructure will directly be in libc.so?  If so, our current
> strategy of attempting to load libthread_db only for the main program
> or a libpthread* library will indeed not work.  And I suppose that will
> also require trying to load libthread_db on every new shared lib...

I think one attempt loading is enough, after all shared objects are
available.  In both the attaching and starting case, libpthread will be
seen by libthread_db if it is there.  I do not think it is necessary to
try loading libthread_db again for each dlopen.  Maybe you could
restrict that to trigger on libpthread, but then dlopen of libpthread
does not really work today.

Thanks,
Florian
Florian Weimer March 29, 2021, 8:26 a.m. UTC | #9
* Florian Weimer via Gdb-patches:

> * Simon Marchi:
>
>>>> If we have to deal with this, I guess that GDB should now do things in a
>>>> different order: go through the whole library list and load their
>>>> symbols.  And then if one of those libraries were libpthread, try to
>>>> initialize libthread_db.
>>> 
>>> Initialization of libthread_db should be unconditional.  Programs use
>>> TLS data without linking against libpthread.  And glibc 2.34 might not
>>> have a separate libpthread at all.
>>
>> Ok, currently GDB attempts to load libthread_db when noticing the main
>> objfile / program (I guess it is needed if the program is statically
>> linked to libpthread?) or when seeing a library named libpthread*.
>
> Would it be possible to load libthread_db unconditionally after loading
> all shared objects?  Then it is loaded only once.
>
>> About the hypothetical scenario for glibc 2.34: do you mean that the
>> pthread infrastructure will directly be in libc.so?  If so, our current
>> strategy of attempting to load libthread_db only for the main program
>> or a libpthread* library will indeed not work.  And I suppose that will
>> also require trying to load libthread_db on every new shared lib...
>
> I think one attempt loading is enough, after all shared objects are
> available.  In both the attaching and starting case, libpthread will be
> seen by libthread_db if it is there.  I do not think it is necessary to
> try loading libthread_db again for each dlopen.  Maybe you could
> restrict that to trigger on libpthread, but then dlopen of libpthread
> does not really work today.

I would appreciate it if we could make some progress on this issue.
Please let me know if you need glibc test builds or something in that
area.  Thanks.
Simon Marchi March 29, 2021, 2:29 p.m. UTC | #10
On 2021-03-29 4:26 a.m., Florian Weimer wrote:
> * Florian Weimer via Gdb-patches:
> 
>> * Simon Marchi:
>>
>>>>> If we have to deal with this, I guess that GDB should now do things in a
>>>>> different order: go through the whole library list and load their
>>>>> symbols.  And then if one of those libraries were libpthread, try to
>>>>> initialize libthread_db.
>>>>
>>>> Initialization of libthread_db should be unconditional.  Programs use
>>>> TLS data without linking against libpthread.  And glibc 2.34 might not
>>>> have a separate libpthread at all.
>>>
>>> Ok, currently GDB attempts to load libthread_db when noticing the main
>>> objfile / program (I guess it is needed if the program is statically
>>> linked to libpthread?) or when seeing a library named libpthread*.
>>
>> Would it be possible to load libthread_db unconditionally after loading
>> all shared objects?  Then it is loaded only once.
>>
>>> About the hypothetical scenario for glibc 2.34: do you mean that the
>>> pthread infrastructure will directly be in libc.so?  If so, our current
>>> strategy of attempting to load libthread_db only for the main program
>>> or a libpthread* library will indeed not work.  And I suppose that will
>>> also require trying to load libthread_db on every new shared lib...
>>
>> I think one attempt loading is enough, after all shared objects are
>> available.  In both the attaching and starting case, libpthread will be
>> seen by libthread_db if it is there.  I do not think it is necessary to
>> try loading libthread_db again for each dlopen.  Maybe you could
>> restrict that to trigger on libpthread, but then dlopen of libpthread
>> does not really work today.
> 
> I would appreciate if we could make some progress on this issue.
> Please let me know if you need glibc test builds or something in that
> area.  Thanks.

Hi Florian,

I'll try to look into it, but I can't promise anything as I have nearly
zero free / personal time for GDB these days.

Simon
diff mbox series

Patch

diff --git a/csu/libc-tls.c b/csu/libc-tls.c
index 06e76bd395..c3589f0a7d 100644
--- a/csu/libc-tls.c
+++ b/csu/libc-tls.c
@@ -24,6 +24,7 @@ 
 #include <stdio.h>
 #include <sys/param.h>
 #include <array_length.h>
+#include <list.h>
 
 #ifdef SHARED
  #error makefile bug, this file is for static only
@@ -193,6 +194,12 @@  __libc_setup_tls (void)
   if (__builtin_expect (lossage != NULL, 0))
     _startup_fatal (lossage);
 
+#if THREAD_GSCOPE_IN_TCB
+  INIT_LIST_HEAD (&_dl_stack_used);
+  INIT_LIST_HEAD (&_dl_stack_user);
+  list_add (&THREAD_SELF->list, &_dl_stack_user);
+#endif
+
   /* Update the executable's link map with enough information to make
      the TLS routines happy.  */
   main_map->l_tls_align = align;
diff --git a/elf/Makefile b/elf/Makefile
index f10cc59e7c..2015383eb2 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -34,7 +34,8 @@  dl-routines	= $(addprefix dl-,load lookup object reloc deps \
 				  version profile tls origin scope \
 				  execstack open close trampoline \
 				  exception sort-maps lookup-direct \
-				  call-libc-early-init write)
+				  call-libc-early-init write \
+				  thread_gscope_wait)
 ifeq (yes,$(use-ldconfig))
 dl-routines += dl-cache
 endif
@@ -54,7 +55,8 @@  endif
 all-dl-routines = $(dl-routines) $(sysdep-dl-routines)
 # But they are absent from the shared libc, because that code is in ld.so.
 elide-routines.os = $(all-dl-routines) dl-support enbl-secure dl-origin \
-		    dl-sysdep dl-exception dl-reloc-static-pie
+		    dl-sysdep dl-exception dl-reloc-static-pie \
+		    thread_gscope_wait
 
 # ld.so uses those routines, plus some special stuff for being the program
 # interpreter and operating independent of libc.
diff --git a/elf/dl-support.c b/elf/dl-support.c
index afbc94df54..15e6d787c9 100644
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -183,10 +183,11 @@  ElfW(Word) _dl_stack_flags = DEFAULT_STACK_PERMS;
 int (*_dl_make_stack_executable_hook) (void **) = _dl_make_stack_executable;
 
 
-/* Function in libpthread to wait for termination of lookups.  */
-void (*_dl_wait_lookup_done) (void);
-
-#if !THREAD_GSCOPE_IN_TCB
+#if THREAD_GSCOPE_IN_TCB
+list_t _dl_stack_used;
+list_t _dl_stack_user;
+int _dl_stack_cache_lock;
+#else
 int _dl_thread_gscope_count;
 #endif
 struct dl_scope_free_list *_dl_scope_free_list;
diff --git a/elf/dl-thread_gscope_wait.c b/elf/dl-thread_gscope_wait.c
new file mode 100644
index 0000000000..28e1494730
--- /dev/null
+++ b/elf/dl-thread_gscope_wait.c
@@ -0,0 +1,2 @@ 
+/* By default, the dynamic linker does not use an out-of-line
+   __thread_gscope_wait function.  */
diff --git a/elf/rtld.c b/elf/rtld.c
index 5d117d0d2c..c4ffc8d4b7 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -48,6 +48,7 @@ 
 #include <array_length.h>
 #include <libc-early-init.h>
 #include <dl-main.h>
+#include <list.h>
 
 #include <assert.h>
 
@@ -799,6 +800,9 @@  cannot allocate TLS data structures for initial thread\n");
   const char *lossage = TLS_INIT_TP (tcbp);
   if (__glibc_unlikely (lossage != NULL))
     _dl_fatal_printf ("cannot set up thread-local storage: %s\n", lossage);
+#if THREAD_GSCOPE_IN_TCB
+  list_add (&THREAD_SELF->list, &GL (dl_stack_user));
+#endif
   tls_init_tp_called = true;
 
   return tcbp;
@@ -1139,6 +1143,11 @@  dl_main (const ElfW(Phdr) *phdr,
   GL(dl_rtld_unlock_recursive) = rtld_lock_default_unlock_recursive;
 #endif
 
+#if THREAD_GSCOPE_IN_TCB
+  INIT_LIST_HEAD (&GL (dl_stack_used));
+  INIT_LIST_HEAD (&GL (dl_stack_user));
+#endif
+
   /* The explicit initialization here is cheaper than processing the reloc
      in the _rtld_local definition's initializer.  */
   GL(dl_make_stack_executable_hook) = &_dl_make_stack_executable;
@@ -2383,6 +2392,9 @@  dl_main (const ElfW(Phdr) *phdr,
       if (__glibc_unlikely (lossage != NULL))
 	_dl_fatal_printf ("cannot set up thread-local storage: %s\n",
 			  lossage);
+#if THREAD_GSCOPE_IN_TCB
+      list_add (&THREAD_SELF->list, &GL (dl_stack_user));
+#endif
     }
 
   /* Make sure no new search directories have been added.  */
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 4b45f8c884..b7f9eeebf6 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -106,26 +106,14 @@ 
 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40MiBi by default.  */
 static size_t stack_cache_actsize;
 
-/* Mutex protecting this variable.  */
-static int stack_cache_lock = LLL_LOCK_INITIALIZER;
-
 /* List of queued stack frames.  */
 static LIST_HEAD (stack_cache);
 
-/* List of the stacks in use.  */
-static LIST_HEAD (stack_used);
-
 /* We need to record what list operations we are going to do so that,
    in case of an asynchronous interruption due to a fork() call, we
    can correct for the work.  */
 static uintptr_t in_flight_stack;
 
-/* List of the threads with user provided stacks in use.  No need to
-   initialize this, since it's done in __pthread_initialize_minimal.  */
-list_t __stack_user __attribute__ ((nocommon));
-hidden_data_def (__stack_user)
-
-
 /* Check whether the stack is still used or not.  */
 #define FREE_P(descr) ((descr)->tid <= 0)
 
@@ -173,7 +161,7 @@  get_cached_stack (size_t *sizep, void **memp)
   struct pthread *result = NULL;
   list_t *entry;
 
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Search the cache for a matching entry.  We search for the
      smallest stack which has at least the required size.  Note that
@@ -206,7 +194,7 @@  get_cached_stack (size_t *sizep, void **memp)
       || __builtin_expect (result->stackblock_size > 4 * size, 0))
     {
       /* Release the lock.  */
-      lll_unlock (stack_cache_lock, LLL_PRIVATE);
+      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
       return NULL;
     }
@@ -218,13 +206,13 @@  get_cached_stack (size_t *sizep, void **memp)
   stack_list_del (&result->list);
 
   /* And add to the list of stacks in use.  */
-  stack_list_add (&result->list, &stack_used);
+  stack_list_add (&result->list, &GL (dl_stack_used));
 
   /* And decrease the cache size.  */
   stack_cache_actsize -= result->stackblock_size;
 
   /* Release the lock early.  */
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Report size and location of the stack to the caller.  */
   *sizep = result->stackblock_size;
@@ -510,12 +498,12 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
 
       /* Prepare to modify global data.  */
-      lll_lock (stack_cache_lock, LLL_PRIVATE);
+      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
       /* And add to the list of stacks in use.  */
-      list_add (&pd->list, &__stack_user);
+      list_add (&pd->list, &GL (dl_stack_user));
 
-      lll_unlock (stack_cache_lock, LLL_PRIVATE);
+      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
     }
   else
     {
@@ -644,12 +632,12 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
 
 	  /* Prepare to modify global data.  */
-	  lll_lock (stack_cache_lock, LLL_PRIVATE);
+	  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 	  /* And add to the list of stacks in use.  */
-	  stack_list_add (&pd->list, &stack_used);
+	  stack_list_add (&pd->list, &GL (dl_stack_used));
 
-	  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+	  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 
 	  /* There might have been a race.  Another thread might have
@@ -690,12 +678,12 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 	  if (__mprotect (guard, guardsize, PROT_NONE) != 0)
 	    {
 	    mprot_error:
-	      lll_lock (stack_cache_lock, LLL_PRIVATE);
+	      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 	      /* Remove the thread from the list.  */
 	      stack_list_del (&pd->list);
 
-	      lll_unlock (stack_cache_lock, LLL_PRIVATE);
+	      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
 	      /* Get rid of the TLS block we allocated.  */
 	      _dl_deallocate_tls (TLS_TPADJ (pd), false);
@@ -799,7 +787,7 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 void
 __deallocate_stack (struct pthread *pd)
 {
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Remove the thread from the list of threads with user defined
      stacks.  */
@@ -815,7 +803,7 @@  __deallocate_stack (struct pthread *pd)
     /* Free the memory associated with the ELF TLS.  */
     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 }
 
 
@@ -831,10 +819,10 @@  __make_stacks_executable (void **stack_endp)
   const size_t pagemask = ~(__getpagesize () - 1);
 #endif
 
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       err = change_stack_perm (list_entry (runp, struct pthread, list)
 #ifdef NEED_SEPARATE_REGISTER_STACK
@@ -860,7 +848,7 @@  __make_stacks_executable (void **stack_endp)
 	  break;
       }
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   return err;
 }
@@ -891,8 +879,8 @@  __reclaim_stacks (void)
 	     pointers at the head of the list are inconsistent.  */
 	  list_t *l = NULL;
 
-	  if (stack_used.next->prev != &stack_used)
-	    l = &stack_used;
+	  if (GL (dl_stack_used).next->prev != &GL (dl_stack_used))
+	    l = &GL (dl_stack_used);
 	  else if (stack_cache.next->prev != &stack_cache)
 	    l = &stack_cache;
 
@@ -914,7 +902,7 @@  __reclaim_stacks (void)
 
   /* Mark all stacks except the still running one as free.  */
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       struct pthread *curp = list_entry (runp, struct pthread, list);
       if (curp != self)
@@ -948,7 +936,7 @@  __reclaim_stacks (void)
     }
 
   /* Add the stack of all running threads to the cache.  */
-  list_splice (&stack_used, &stack_cache);
+  list_splice (&GL (dl_stack_used), &stack_cache);
 
   /* Remove the entry for the current thread to from the cache list
      and add it to the list of running threads.  Which of the two
@@ -956,13 +944,13 @@  __reclaim_stacks (void)
   stack_list_del (&self->list);
 
   /* Re-initialize the lists for all the threads.  */
-  INIT_LIST_HEAD (&stack_used);
-  INIT_LIST_HEAD (&__stack_user);
+  INIT_LIST_HEAD (&GL (dl_stack_used));
+  INIT_LIST_HEAD (&GL (dl_stack_user));
 
   if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
-    list_add (&self->list, &__stack_user);
+    list_add (&self->list, &GL (dl_stack_user));
   else
-    list_add (&self->list, &stack_used);
+    list_add (&self->list, &GL (dl_stack_used));
 
   /* There is one thread running.  */
   __nptl_nthreads = 1;
@@ -970,7 +958,7 @@  __reclaim_stacks (void)
   in_flight_stack = 0;
 
   /* Initialize locks.  */
-  stack_cache_lock = LLL_LOCK_INITIALIZER;
+  GL (dl_stack_cache_lock) = LLL_LOCK_INITIALIZER;
   __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
 }
 
@@ -1083,7 +1071,7 @@  __nptl_setxid (struct xid_command *cmdp)
 {
   int signalled;
   int result;
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   __xidcmd = cmdp;
   cmdp->cntr = 0;
@@ -1093,7 +1081,7 @@  __nptl_setxid (struct xid_command *cmdp)
 
   /* Iterate over the list with system-allocated threads first.  */
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1103,7 +1091,7 @@  __nptl_setxid (struct xid_command *cmdp)
     }
 
   /* Now the list with threads using user-allocated stacks.  */
-  list_for_each (runp, &__stack_user)
+  list_for_each (runp, &GL (dl_stack_user))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1119,7 +1107,7 @@  __nptl_setxid (struct xid_command *cmdp)
     {
       signalled = 0;
 
-      list_for_each (runp, &stack_used)
+      list_for_each (runp, &GL (dl_stack_used))
 	{
 	  struct pthread *t = list_entry (runp, struct pthread, list);
 	  if (t == self)
@@ -1128,7 +1116,7 @@  __nptl_setxid (struct xid_command *cmdp)
 	  signalled += setxid_signal_thread (cmdp, t);
 	}
 
-      list_for_each (runp, &__stack_user)
+      list_for_each (runp, &GL (dl_stack_user))
 	{
 	  struct pthread *t = list_entry (runp, struct pthread, list);
 	  if (t == self)
@@ -1149,7 +1137,7 @@  __nptl_setxid (struct xid_command *cmdp)
 
   /* Clean up flags, so that no thread blocks during exit waiting
      for a signal which will never come.  */
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1158,7 +1146,7 @@  __nptl_setxid (struct xid_command *cmdp)
       setxid_unmark_thread (cmdp, t);
     }
 
-  list_for_each (runp, &__stack_user)
+  list_for_each (runp, &GL (dl_stack_user))
     {
       struct pthread *t = list_entry (runp, struct pthread, list);
       if (t == self)
@@ -1180,7 +1168,7 @@  __nptl_setxid (struct xid_command *cmdp)
     }
   __nptl_setxid_error (cmdp, error);
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
   return result;
 }
 
@@ -1204,75 +1192,16 @@  void
 attribute_hidden
 __pthread_init_static_tls (struct link_map *map)
 {
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   /* Iterate over the list with system-allocated threads first.  */
   list_t *runp;
-  list_for_each (runp, &stack_used)
+  list_for_each (runp, &GL (dl_stack_used))
     init_one_static_tls (list_entry (runp, struct pthread, list), map);
 
   /* Now the list with threads using user-allocated stacks.  */
-  list_for_each (runp, &__stack_user)
+  list_for_each (runp, &GL (dl_stack_user))
     init_one_static_tls (list_entry (runp, struct pthread, list), map);
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
-}
-
-
-void
-attribute_hidden
-__wait_lookup_done (void)
-{
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
-
-  struct pthread *self = THREAD_SELF;
-
-  /* Iterate over the list with system-allocated threads first.  */
-  list_t *runp;
-  list_for_each (runp, &stack_used)
-    {
-      struct pthread *t = list_entry (runp, struct pthread, list);
-      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
-	continue;
-
-      int *const gscope_flagp = &t->header.gscope_flag;
-
-      /* We have to wait until this thread is done with the global
-	 scope.  First tell the thread that we are waiting and
-	 possibly have to be woken.  */
-      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
-						THREAD_GSCOPE_FLAG_WAIT,
-						THREAD_GSCOPE_FLAG_USED))
-	continue;
-
-      do
-	futex_wait_simple ((unsigned int *) gscope_flagp,
-			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
-      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
-    }
-
-  /* Now the list with threads using user-allocated stacks.  */
-  list_for_each (runp, &__stack_user)
-    {
-      struct pthread *t = list_entry (runp, struct pthread, list);
-      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
-	continue;
-
-      int *const gscope_flagp = &t->header.gscope_flag;
-
-      /* We have to wait until this thread is done with the global
-	 scope.  First tell the thread that we are waiting and
-	 possibly have to be woken.  */
-      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
-						THREAD_GSCOPE_FLAG_WAIT,
-						THREAD_GSCOPE_FLAG_USED))
-	continue;
-
-      do
-	futex_wait_simple ((unsigned int *) gscope_flagp,
-			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
-      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
-    }
-
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 }
diff --git a/nptl/descr.h b/nptl/descr.h
index d8343ff9a1..b172ee408b 100644
--- a/nptl/descr.h
+++ b/nptl/descr.h
@@ -162,7 +162,8 @@  struct pthread
     void *__padding[24];
   };
 
-  /* This descriptor's link on the `stack_used' or `__stack_user' list.  */
+  /* This descriptor's link on the GL (dl_stack_used) or
+     GL (dl_stack_user) list.  */
   list_t list;
 
   /* Thread ID - which is also a 'is this thread descriptor (and
diff --git a/nptl/nptl-init.c b/nptl/nptl-init.c
index 4aa1231bec..53b817715d 100644
--- a/nptl/nptl-init.c
+++ b/nptl/nptl-init.c
@@ -251,12 +251,9 @@  __pthread_initialize_minimal_internal (void)
      purposes this is good enough.  */
   THREAD_SETMEM (pd, stackblock_size, (size_t) __libc_stack_end);
 
-  /* Initialize the list of all running threads with the main thread.  */
-  INIT_LIST_HEAD (&__stack_user);
-  list_add (&pd->list, &__stack_user);
-
-  /* Before initializing __stack_user, the debugger could not find us and
-     had to set __nptl_initial_report_events.  Propagate its setting.  */
+  /* Before initializing GL (dl_stack_user), the debugger could not
+     find us and had to set __nptl_initial_report_events.  Propagate
+     its setting.  */
   THREAD_SETMEM (pd, report_events, __nptl_initial_report_events);
 
   struct sigaction sa;
@@ -336,8 +333,6 @@  __pthread_initialize_minimal_internal (void)
 
   GL(dl_init_static_tls) = &__pthread_init_static_tls;
 
-  GL(dl_wait_lookup_done) = &__wait_lookup_done;
-
   /* Register the fork generation counter with the libc.  */
 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
   __libc_multiple_threads_ptr =
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 686f54aa93..a7510f9f63 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -208,10 +208,6 @@  extern void __default_pthread_attr_freeres (void) attribute_hidden;
 extern size_t __static_tls_size attribute_hidden;
 extern size_t __static_tls_align_m1 attribute_hidden;
 
-/* Thread descriptor handling.  */
-extern list_t __stack_user;
-hidden_proto (__stack_user)
-
 /* Attribute handling.  */
 extern struct pthread_attr *__attr_list attribute_hidden;
 extern int __attr_list_lock attribute_hidden;
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 447f005ece..bad4e57a84 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -213,9 +213,9 @@  __find_in_stack_list (struct pthread *pd)
   list_t *entry;
   struct pthread *result = NULL;
 
-  lll_lock (stack_cache_lock, LLL_PRIVATE);
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
-  list_for_each (entry, &stack_used)
+  list_for_each (entry, &GL (dl_stack_used))
     {
       struct pthread *curp;
 
@@ -228,7 +228,7 @@  __find_in_stack_list (struct pthread *pd)
     }
 
   if (result == NULL)
-    list_for_each (entry, &__stack_user)
+    list_for_each (entry, &GL (dl_stack_user))
       {
 	struct pthread *curp;
 
@@ -240,7 +240,7 @@  __find_in_stack_list (struct pthread *pd)
 	  }
       }
 
-  lll_unlock (stack_cache_lock, LLL_PRIVATE);
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
   return result;
 }
diff --git a/nptl_db/structs.def b/nptl_db/structs.def
index 472e9008fa..2ca33598b1 100644
--- a/nptl_db/structs.def
+++ b/nptl_db/structs.def
@@ -69,8 +69,6 @@  DB_STRUCT (td_eventbuf_t)
 DB_STRUCT_FIELD (td_eventbuf_t, eventnum)
 DB_STRUCT_FIELD (td_eventbuf_t, eventdata)
 
-DB_SYMBOL (stack_used)
-DB_SYMBOL (__stack_user)
 DB_SYMBOL (nptl_version)
 DB_FUNCTION (__nptl_create_event)
 DB_FUNCTION (__nptl_death_event)
@@ -106,6 +104,8 @@  DB_STRUCT (rtld_global)
 DB_RTLD_VARIABLE (_rtld_global)
 #endif
 DB_RTLD_GLOBAL_FIELD (dl_tls_dtv_slotinfo_list)
+DB_RTLD_GLOBAL_FIELD (dl_stack_user)
+DB_RTLD_GLOBAL_FIELD (dl_stack_used)
 
 DB_STRUCT (dtv_slotinfo_list)
 DB_STRUCT_FIELD (dtv_slotinfo_list, len)
diff --git a/nptl_db/td_init.c b/nptl_db/td_init.c
index 86773dcb06..79a1f90312 100644
--- a/nptl_db/td_init.c
+++ b/nptl_db/td_init.c
@@ -29,3 +29,17 @@  td_init (void)
   LOG ("td_init");
   return TD_OK;
 }
+
+/* Resolve SYM__rtld_global in LD_SO on first use and cache the result
+   in TA: zero means the lookup has not been tried yet, (void *) -1
+   records a failed lookup.  Return true if the address is usable.  */
+bool
+__td_ta_rtld_global (td_thragent_t *ta)
+{
+  if (ta->ta_addr__rtld_global == 0
+      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
+			&ta->ta_addr__rtld_global) != PS_OK)
+    ta->ta_addr__rtld_global = (void *) -1;
+
+  return ta->ta_addr__rtld_global != (void *) -1;
+}
diff --git a/nptl_db/td_ta_map_lwp2thr.c b/nptl_db/td_ta_map_lwp2thr.c
index d04d4bec23..81ff48808c 100644
--- a/nptl_db/td_ta_map_lwp2thr.c
+++ b/nptl_db/td_ta_map_lwp2thr.c
@@ -187,7 +187,7 @@  td_ta_map_lwp2thr (const td_thragent_t *ta_arg,
      fake a special descriptor for the initial thread.  */
 
   psaddr_t list;
-  td_err_e err = DB_GET_SYMBOL (list, ta, __stack_user);
+  td_err_e err = __td_ta_stack_user (ta, &list);
   if (err != TD_OK)
     return err;
 
diff --git a/nptl_db/td_ta_thr_iter.c b/nptl_db/td_ta_thr_iter.c
index d59782bd6e..e406bcbd39 100644
--- a/nptl_db/td_ta_thr_iter.c
+++ b/nptl_db/td_ta_thr_iter.c
@@ -133,14 +133,14 @@  td_ta_thr_iter (const td_thragent_t *ta_arg, td_thr_iter_f *callback,
      have to iterate over both lists separately.  We start with the
      list of threads with user-defined stacks.  */
 
-  err = DB_GET_SYMBOL (list, ta, __stack_user);
+  err = __td_ta_stack_user (ta, &list);
   if (err == TD_OK)
     err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
 			       list, true);
 
   /* And the threads with stacks allocated by the implementation.  */
   if (err == TD_OK)
-    err = DB_GET_SYMBOL (list, ta, stack_used);
+    err = __td_ta_stack_used (ta, &list);
   if (err == TD_OK)
     err = iterate_thread_list (ta, callback, cbdata_p, state, ti_pri,
 			       list, false);
diff --git a/nptl_db/td_thr_tlsbase.c b/nptl_db/td_thr_tlsbase.c
index 8198934112..eeba0a04d8 100644
--- a/nptl_db/td_thr_tlsbase.c
+++ b/nptl_db/td_thr_tlsbase.c
@@ -28,12 +28,7 @@  dtv_slotinfo_list (td_thragent_t *ta,
   td_err_e err;
   psaddr_t head;
 
-  if (ta->ta_addr__rtld_global == 0
-      && td_mod_lookup (ta->ph, LD_SO, SYM__rtld_global,
-			&ta->ta_addr__rtld_global) != PS_OK)
-    ta->ta_addr__rtld_global = (void*)-1;
-
-  if (ta->ta_addr__rtld_global != (void*)-1)
+  if (__td_ta_rtld_global (ta))
     {
       err = DB_GET_FIELD (head, ta, ta->ta_addr__rtld_global,
 			  rtld_global, _dl_tls_dtv_slotinfo_list, 0);
diff --git a/nptl_db/td_thr_validate.c b/nptl_db/td_thr_validate.c
index c709c5e9be..d2f4107cf2 100644
--- a/nptl_db/td_thr_validate.c
+++ b/nptl_db/td_thr_validate.c
@@ -20,6 +20,41 @@ 
 #include "thread_dbP.h"
 #include <stdbool.h>
 
+td_err_e
+__td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
+{
+  /* Preferred source: the _dl_stack_user field inside _rtld_global.  */
+  if (__td_ta_rtld_global (ta))
+    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
+				 rtld_global, _dl_stack_user, 0);
+
+  /* Otherwise fall back to the _dl_stack_user global variable.  */
+  if (ta->ta_addr__dl_stack_user == 0
+      && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_user,
+			&ta->ta_addr__dl_stack_user) != PS_OK)
+    return TD_ERR;
+  *plist = ta->ta_addr__dl_stack_user;
+  return TD_OK;
+}
+
+td_err_e
+__td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
+{
+  /* Preferred source: the _dl_stack_used field inside _rtld_global.  */
+  if (__td_ta_rtld_global (ta))
+    return DB_GET_FIELD_ADDRESS (*plist, ta, ta->ta_addr__rtld_global,
+				 rtld_global, _dl_stack_used, 0);
+
+  /* Otherwise fall back to the _dl_stack_used global variable and
+     remember its address once the lookup has succeeded.  */
+  if (ta->ta_addr__dl_stack_used == 0
+      && td_mod_lookup (ta->ph, NULL, SYM__dl_stack_used,
+			&ta->ta_addr__dl_stack_used) != PS_OK)
+    return TD_ERR;
+  *plist = ta->ta_addr__dl_stack_used;
+  return TD_OK;
+}
+
 static td_err_e
 check_thread_list (const td_thrhandle_t *th, psaddr_t head, bool *uninit)
 {
@@ -62,7 +97,7 @@  td_thr_validate (const td_thrhandle_t *th)
 
   /* First check the list with threads using user allocated stacks.  */
   bool uninit = false;
-  err = DB_GET_SYMBOL (list, th->th_ta_p, __stack_user);
+  err = __td_ta_stack_user (th->th_ta_p, &list);
   if (err == TD_OK)
     err = check_thread_list (th, list, &uninit);
 
@@ -70,7 +105,7 @@  td_thr_validate (const td_thrhandle_t *th)
      using implementation allocated stacks.  */
   if (err == TD_NOTHR)
     {
-      err = DB_GET_SYMBOL (list, th->th_ta_p, stack_used);
+      err = __td_ta_stack_used (th->th_ta_p, &list);
       if (err == TD_OK)
 	err = check_thread_list (th, list, &uninit);
 
diff --git a/nptl_db/thread_dbP.h b/nptl_db/thread_dbP.h
index 8a666b8d4f..9574e68d70 100644
--- a/nptl_db/thread_dbP.h
+++ b/nptl_db/thread_dbP.h
@@ -269,4 +269,17 @@  extern td_err_e _td_check_sizeof (td_thragent_t *ta, uint32_t *sizep,
 extern td_err_e __td_ta_lookup_th_unique (const td_thragent_t *ta,
 					  lwpid_t lwpid, td_thrhandle_t *th);
 
+/* Try to initialize TA->ta_addr__rtld_global.  Return true on
+   success, false on failure (which may be cached).  */
+bool __td_ta_rtld_global (td_thragent_t *ta) attribute_hidden;
+
+/* Obtain the address of the list_t fields _dl_stack_user and
+   _dl_stack_used in _rtld_global, or fall back to the global
+   variables of the same name (to support statically linked
+   programs).  */
+td_err_e __td_ta_stack_user (td_thragent_t *ta, psaddr_t *plist)
+  attribute_hidden;
+td_err_e __td_ta_stack_used (td_thragent_t *ta, psaddr_t *plist)
+  attribute_hidden;
+
 #endif /* thread_dbP.h */
diff --git a/sysdeps/aarch64/nptl/tls.h b/sysdeps/aarch64/nptl/tls.h
index e5627f777e..dbef4704f1 100644
--- a/sysdeps/aarch64/nptl/tls.h
+++ b/sysdeps/aarch64/nptl/tls.h
@@ -129,8 +129,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 # endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/alpha/nptl/tls.h b/sysdeps/alpha/nptl/tls.h
index 82549607fd..f3101f57c9 100644
--- a/sysdeps/alpha/nptl/tls.h
+++ b/sysdeps/alpha/nptl/tls.h
@@ -123,8 +123,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #else /* __ASSEMBLER__ */
 # include <tcb-offsets.h>
diff --git a/sysdeps/arc/nptl/tls.h b/sysdeps/arc/nptl/tls.h
index 184b550ab5..cc5abb1931 100644
--- a/sysdeps/arc/nptl/tls.h
+++ b/sysdeps/arc/nptl/tls.h
@@ -131,8 +131,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* !__ASSEMBLER__ */
 
diff --git a/sysdeps/arm/nptl/tls.h b/sysdeps/arm/nptl/tls.h
index 24a488ff37..91dd7498e4 100644
--- a/sysdeps/arm/nptl/tls.h
+++ b/sysdeps/arm/nptl/tls.h
@@ -120,8 +120,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/csky/nptl/tls.h b/sysdeps/csky/nptl/tls.h
index bcca9674a1..f308773d40 100644
--- a/sysdeps/csky/nptl/tls.h
+++ b/sysdeps/csky/nptl/tls.h
@@ -147,8 +147,6 @@  typedef struct
       atomic_write_barrier ();						      \
     }									      \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 382eeb9be0..b1da03cafe 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -38,6 +38,7 @@ 
 #include <libc-lock.h>
 #include <hp-timing.h>
 #include <tls.h>
+#include <list_t.h>
 
 __BEGIN_DECLS
 
@@ -461,15 +462,22 @@  struct rtld_global
 
   EXTERN void (*_dl_init_static_tls) (struct link_map *);
 
-  EXTERN void (*_dl_wait_lookup_done) (void);
-
   /* Scopes to free after next THREAD_GSCOPE_WAIT ().  */
   EXTERN struct dl_scope_free_list
   {
     size_t count;
     void *list[50];
   } *_dl_scope_free_list;
-#if !THREAD_GSCOPE_IN_TCB
+#if THREAD_GSCOPE_IN_TCB
+  /* List of active thread stacks, with memory managed by glibc.  */
+  EXTERN list_t _dl_stack_used;
+
+  /* List of thread stacks that were allocated by the application.  */
+  EXTERN list_t _dl_stack_user;
+
+  /* Mutex protecting the stack lists.  */
+  EXTERN int _dl_stack_cache_lock;
+#else
   EXTERN int _dl_thread_gscope_count;
 #endif
 #ifdef SHARED
@@ -1252,6 +1260,11 @@  link_map_audit_state (struct link_map *l, size_t index)
 }
 #endif /* SHARED */
 
+#if THREAD_GSCOPE_IN_TCB
+void __thread_gscope_wait (void) attribute_hidden;
+# define THREAD_GSCOPE_WAIT() __thread_gscope_wait ()
+#endif
+
 __END_DECLS
 
 #endif /* ldsodefs.h */
diff --git a/sysdeps/hppa/nptl/tls.h b/sysdeps/hppa/nptl/tls.h
index 2315340735..f319cb42e2 100644
--- a/sysdeps/hppa/nptl/tls.h
+++ b/sysdeps/hppa/nptl/tls.h
@@ -154,8 +154,6 @@  static inline void __set_cr27(struct pthread *cr27)
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* !__ASSEMBLER__ */
 
diff --git a/sysdeps/i386/nptl/tls.h b/sysdeps/i386/nptl/tls.h
index 5042d52b98..b74347bacd 100644
--- a/sysdeps/i386/nptl/tls.h
+++ b/sysdeps/i386/nptl/tls.h
@@ -387,8 +387,6 @@  tls_fill_user_desc (union user_desc_init *desc,
   while (0)
 #define THREAD_GSCOPE_SET_FLAG() \
   THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/ia64/nptl/tls.h b/sysdeps/ia64/nptl/tls.h
index f5b1684ef3..42b082dad6 100644
--- a/sysdeps/ia64/nptl/tls.h
+++ b/sysdeps/ia64/nptl/tls.h
@@ -175,8 +175,6 @@  register struct pthread *__thread_self __asm__("r13");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/m68k/nptl/tls.h b/sysdeps/m68k/nptl/tls.h
index 68ea952e79..69e174484e 100644
--- a/sysdeps/m68k/nptl/tls.h
+++ b/sysdeps/m68k/nptl/tls.h
@@ -153,8 +153,6 @@  extern void * __m68k_read_tp (void);
       atomic_write_barrier ();						\
     }									\
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/microblaze/nptl/tls.h b/sysdeps/microblaze/nptl/tls.h
index a094fdb798..78f63019dc 100644
--- a/sysdeps/microblaze/nptl/tls.h
+++ b/sysdeps/microblaze/nptl/tls.h
@@ -136,8 +136,6 @@  typedef struct
       atomic_write_barrier ();                                              \
     }                                                                       \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL (dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/mips/nptl/tls.h b/sysdeps/mips/nptl/tls.h
index 8b55f19c37..7e7ac43d6c 100644
--- a/sysdeps/mips/nptl/tls.h
+++ b/sysdeps/mips/nptl/tls.h
@@ -178,8 +178,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/nios2/nptl/tls.h b/sysdeps/nios2/nptl/tls.h
index facb27c761..776fe9bb2d 100644
--- a/sysdeps/nios2/nptl/tls.h
+++ b/sysdeps/nios2/nptl/tls.h
@@ -157,8 +157,6 @@  register struct pthread *__thread_self __asm__("r23");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/nptl/dl-thread_gscope_wait.c b/sysdeps/nptl/dl-thread_gscope_wait.c
new file mode 100644
index 0000000000..a9bfbee24c
--- /dev/null
+++ b/sysdeps/nptl/dl-thread_gscope_wait.c
@@ -0,0 +1,80 @@ 
+/* Out-of-line notification function for the GSCOPE locking mechanism.
+   Copyright (C) 2007-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <nptl/descr.h>
+#include <futex-internal.h>
+#include <ldsodefs.h>
+#include <list.h>
+#include <lowlevellock.h>
+
+/* Wait until every thread on the list at HEAD, other than SELF, has
+   left the global scope.  Used for both GL (dl_stack_used) and
+   GL (dl_stack_user), which need the same treatment.  The caller
+   must hold GL (dl_stack_cache_lock) for the whole walk.  */
+static void
+gscope_wait_list (list_t *head, struct pthread *self)
+{
+  list_t *runp;
+  list_for_each (runp, head)
+    {
+      struct pthread *t = list_entry (runp, struct pthread, list);
+
+      /* Nothing to wait for on our own descriptor or on a thread
+	 whose flag is THREAD_GSCOPE_FLAG_UNUSED.  */
+      if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
+	continue;
+
+      int *const gscope_flagp = &t->header.gscope_flag;
+
+      /* We have to wait until this thread is done with the global
+	 scope.  First tell the thread that we are waiting and
+	 possibly have to be woken.  If the flag is no longer USED,
+	 the exchange fails and there is nothing to wait for.  */
+      if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
+						THREAD_GSCOPE_FLAG_WAIT,
+						THREAD_GSCOPE_FLAG_USED))
+	continue;
+
+      /* Sleep while the flag still reads WAIT; the loop re-checks
+	 it after every return from the futex wait.  */
+      do
+	futex_wait_simple ((unsigned int *) gscope_flagp,
+			   THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
+      while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
+    }
+}
+
+/* Block until no other thread is still using the global scope.
+   This is the out-of-line implementation of THREAD_GSCOPE_WAIT.
+   Both stack lists are walked with GL (dl_stack_cache_lock) held,
+   the same lock nptl takes when it adds or removes list entries.  */
+void
+__thread_gscope_wait (void)
+{
+  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
+
+  struct pthread *self = THREAD_SELF;
+
+  /* Iterate over the list with system-allocated threads first.  */
+  gscope_wait_list (&GL (dl_stack_used), self);
+
+  /* Now the list with threads using user-allocated stacks.  */
+  gscope_wait_list (&GL (dl_stack_user), self);
+
+  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
+}
diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
index b1f0b30fbb..261eecfd18 100644
--- a/sysdeps/powerpc/nptl/tls.h
+++ b/sysdeps/powerpc/nptl/tls.h
@@ -245,8 +245,6 @@  register void *__thread_register __asm__ ("r13");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/riscv/nptl/tls.h b/sysdeps/riscv/nptl/tls.h
index a9167bc143..41d9db10cf 100644
--- a/sysdeps/riscv/nptl/tls.h
+++ b/sysdeps/riscv/nptl/tls.h
@@ -139,8 +139,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/s390/nptl/tls.h b/sysdeps/s390/nptl/tls.h
index 4a9b3570eb..7653109617 100644
--- a/sysdeps/s390/nptl/tls.h
+++ b/sysdeps/s390/nptl/tls.h
@@ -185,8 +185,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/sh/nptl/tls.h b/sysdeps/sh/nptl/tls.h
index 6ccad251f9..2d5e3731b2 100644
--- a/sysdeps/sh/nptl/tls.h
+++ b/sysdeps/sh/nptl/tls.h
@@ -161,8 +161,6 @@  typedef struct
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */
 
diff --git a/sysdeps/sparc/nptl/tls.h b/sysdeps/sparc/nptl/tls.h
index 18a548e86b..b08a156131 100644
--- a/sysdeps/sparc/nptl/tls.h
+++ b/sysdeps/sparc/nptl/tls.h
@@ -158,8 +158,6 @@  register struct pthread *__thread_self __asm__("%g7");
       atomic_write_barrier ();						     \
     }									     \
   while (0)
-#define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* !ASSEMBLER */
 
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index fbd7f9cb89..a08bf972de 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -332,8 +332,6 @@  _Static_assert (offsetof (tcbhead_t, __glibc_unused2) == 0x80,
   while (0)
 # define THREAD_GSCOPE_SET_FLAG() \
   THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED)
-# define THREAD_GSCOPE_WAIT() \
-  GL(dl_wait_lookup_done) ()
 
 #endif /* __ASSEMBLER__ */