diff mbox

[PR18457] Don't require rtld lock to compute DTV addr for static TLS

Message ID orvbf5ffyt.fsf@livre.home
State New
Headers show

Commit Message

Alexandre Oliva June 3, 2015, 6:44 a.m. UTC
We used to store static TLS addrs in the DTV at module load time, but
this required one thread to modify another thread's DTV.  Now that we
defer the DTV update to the first use in the thread, we should do so
without taking the rtld lock if the module is already assigned to static
TLS.  Taking the lock introduces deadlocks where there weren't any
before.

This patch fixes the deadlock caused by tls_get_addr's unnecessarily
taking the rtld lock to initialize the DTV entry for tls_dtor_list
within __call_dtors_list, which deadlocks with a dlclose of a module
whose finalizer joins with that thread.  The patch does not, however,
attempt to fix other potential sources of similar deadlocks, such as
the explicit rtld locks taken by call_dtors_list, when the dtor list
is not empty; lazy relocation of the reference to tls_dtor_list, when
TLS Descriptors are in use; when tls dtors call functions through the
PLT and lazy relocation needs to be performed, or any such called
functions interact with the dynamic loader in ways that require its
lock to be taken.

Ok to install?


for  ChangeLog

	[PR dynamic-link/18457]
	* elf/dl-tls.c (tls_get_addr_tail): Don't take the rtld lock
	if we already have a final static TLS offset.
	* nptl/tst-join7.c, nptl/tst-join7mod.c: New.
---
 NEWS                |    2 +-
 elf/dl-tls.c        |   38 ++++++++++++++++++++++++++------------
 nptl/Makefile       |   10 ++++++++--
 nptl/tst-join7.c    |   12 ++++++++++++
 nptl/tst-join7mod.c |   29 +++++++++++++++++++++++++++++
 5 files changed, 76 insertions(+), 15 deletions(-)
 create mode 100644 nptl/tst-join7.c
 create mode 100644 nptl/tst-join7mod.c

Comments

Torvald Riegel June 3, 2015, 10:19 a.m. UTC | #1
On Wed, 2015-06-03 at 03:44 -0300, Alexandre Oliva wrote:
> We used to store static TLS addrs in the DTV at module load time, but
> this required one thread to modify another thread's DTV.  Now that we
> defer the DTV update to the first use in the thread, we should do so
> without taking the rtld lock if the module is already assigned to static
> TLS.  Taking the lock introduces deadlocks where there weren't any
> before.
> 
> This patch fixes the deadlock caused by tls_get_addr's unnecessarily
> taking the rtld lock to initialize the DTV entry for tls_dtor_list
> within __call_dtors_list, which deadlocks with a dlclose of a module
> whose finalizer joins with that thread.  The patch does not, however,
> attempt to fix other potential sources of similar deadlocks, such as
> the explicit rtld locks taken by call_dtors_list, when the dtor list
> is not empty; lazy relocation of the reference to tls_dtor_list, when
> TLS Descriptors are in use; when tls dtors call functions through the
> PLT and lazy relocation needs to be performed, or any such called
> functions interact with the dynamic loader in ways that require its
> lock to be taken.
> 
> Ok to install?

It would be good to have the synchronization for TLS, in particular how
we implement it and which lock protects accesses to which data, be
documented somewhere.  I didn't see a pointer to such documentation,
does it exist?  If not, and given that you seem to be familiar with it,
it would be good if you could add it.

> diff --git a/elf/dl-tls.c b/elf/dl-tls.c
> index 20c7e33..726f1ea 100644
> --- a/elf/dl-tls.c
> +++ b/elf/dl-tls.c
> @@ -755,30 +755,44 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
>        the_map = listp->slotinfo[idx].map;
>      }
>  
> -  /* Make sure that, if a dlopen running in parallel forces the
> -     variable into static storage, we'll wait until the address in the
> -     static TLS block is set up, and use that.  If we're undecided
> -     yet, make sure we make the decision holding the lock as well.  */
> -  if (__glibc_unlikely (the_map->l_tls_offset
> -			!= FORCED_DYNAMIC_TLS_OFFSET))
> +  /* If the TLS block for the map is already assigned to dynamic or to
> +     static TLS, avoid the lock.  Be careful to use the same value for
> +     both tests; if we reloaded it, the second test might mistake
> +     forced dynamic for an offset.  Now, if the decision hasn't been
> +     made, take the rtld lock, so that an ongoing dlopen gets a chance
> +     to complete, and then retest; if the decision is still pending,
> +     force the module to dynamic TLS.  */
> +  ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset);

This should document why relaxed MO is sufficient, or briefly refer to
existing documentation that does so.  For example, if we do not rely on
anything that the thread providing the value did before setting the
value.

> +  if (__glibc_unlikely (offset != FORCED_DYNAMIC_TLS_OFFSET))
>      {
> +      if (__glibc_unlikely (offset != NO_TLS_OFFSET))
> +	goto static_tls;
>        __rtld_lock_lock_recursive (GL(dl_load_lock));

Why do we need the lock at all?  Are we protecting just access to
l_tls_offset or to other memory locations as well?  If it's just
l_tls_offset, we could use a CAS and avoid the lock altogether.

> -      if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET))
> +      offset = the_map->l_tls_offset;

l_tls_offset is effectively concurrently accessed data, so this must use
atomic accesses.  Otherwise, you're telling the compiler that there's no
concurrent access (because you promise to give it a program free of data
races), and allow it to reload or speculatively write a value to this
memory location, for example.  Likewise for the store below.

> +      if (__glibc_likely (offset == NO_TLS_OFFSET))
>  	{
>  	  the_map->l_tls_offset = FORCED_DYNAMIC_TLS_OFFSET;

Must be an atomic access.

>  	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
>  	}
> -      else if (__glibc_likely (the_map->l_tls_offset
> -			       != FORCED_DYNAMIC_TLS_OFFSET))
> +      else if (__glibc_likely (offset != FORCED_DYNAMIC_TLS_OFFSET))
>  	{
> +	  /* The decision is made, and it is final.  We use the value
> +	     we've already loaded, but we could even load the offset
> +	     after releasing the lock, since it won't change.  Should
> +	     the module be released while another thread references
> +	     one of its TLS variables, that's undefined behavior.  */
> +	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
> +
> +	static_tls:
> +	  ;
> +
>  #if TLS_TCB_AT_TP
> -	  void *p = (char *) THREAD_SELF - the_map->l_tls_offset;
> +	  void *p = (char *) THREAD_SELF - offset;
>  #elif TLS_DTV_AT_TP
> -	  void *p = (char *) THREAD_SELF + the_map->l_tls_offset + TLS_PRE_TCB_SIZE;
> +	  void *p = (char *) THREAD_SELF + offset + TLS_PRE_TCB_SIZE;
>  #else
>  # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
>  #endif
> -	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
>  
>  	  dtv[GET_ADDR_MODULE].pointer.is_static = true;
>  	  dtv[GET_ADDR_MODULE].pointer.val = p;
> diff --git a/nptl/Makefile b/nptl/Makefile
> index 3dd2944..4ffd13c 100644
> --- a/nptl/Makefile
> +++ b/nptl/Makefile
> @@ -242,7 +242,7 @@ tests = tst-typesizes \
>  	tst-basic7 \
>  	tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \
>  	tst-raise1 \
> -	tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 \
> +	tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 tst-join7 \
>  	tst-detach1 \
>  	tst-eintr1 tst-eintr2 tst-eintr3 tst-eintr4 tst-eintr5 \
>  	tst-tsd1 tst-tsd2 tst-tsd3 tst-tsd4 tst-tsd5 tst-tsd6 \
> @@ -320,7 +320,8 @@ endif
>  modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \
>  		tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \
>  		tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \
> -		tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod
> +		tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \
> +		tst-join7mod
>  extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o
>  test-extras += $(modules-names) tst-cleanup4aux
>  test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names)))
> @@ -525,6 +526,11 @@ $(objpfx)tst-tls6.out: tst-tls6.sh $(objpfx)tst-tls5 \
>  	$(evaluate-test)
>  endif
>  
> +$(objpfx)tst-join7: $(libdl) $(shared-thread-library)
> +$(objpfx)tst-join7.out: $(objpfx)tst-join7mod.so
> +$(objpfx)tst-join7mod.so: $(shared-thread-library)
> +LDFLAGS-tst-join7mod.so = -Wl,-soname,tst-join7mod.so
> +
>  $(objpfx)tst-dlsym1: $(libdl) $(shared-thread-library)
>  
>  $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so
> diff --git a/nptl/tst-join7.c b/nptl/tst-join7.c
> new file mode 100644
> index 0000000..bf6fc76
> --- /dev/null
> +++ b/nptl/tst-join7.c
> @@ -0,0 +1,12 @@
> +#include <dlfcn.h>
> +
> +int
> +do_test (void)
> +{
> +  void *f = dlopen ("tst-join7mod.so", RTLD_NOW | RTLD_GLOBAL);
> +  if (f) dlclose (f); else return 1;
> +  return 0;
> +}
> +
> +#define TEST_FUNCTION do_test ()
> +#include "../test-skeleton.c"
> diff --git a/nptl/tst-join7mod.c b/nptl/tst-join7mod.c
> new file mode 100644
> index 0000000..a8c7bc0
> --- /dev/null
> +++ b/nptl/tst-join7mod.c
> @@ -0,0 +1,29 @@
> +#include <stdio.h>
> +#include <pthread.h>
> +
> +static pthread_t th;
> +static int running = 1;
> +
> +static void *
> +test_run (void *p)
> +{
> +  while (running)

That's a data race with the modification do_end, or isn't it?  Use
atomics, a semaphore (with trywait), or trylock.

> +    fprintf (stderr, "XXX test_run\n");
> +  fprintf (stderr, "XXX test_run FINISHED\n");
> +  return NULL;
> +}
> +
> +static void __attribute__ ((constructor))
> +do_init (void)
> +{
> +  pthread_create (&th, NULL, test_run, NULL);
> +}
> +
> +static void __attribute__ ((destructor))
> +do_end (void)
> +{
> +  running = 0;
> +  fprintf (stderr, "thread_join...\n");
> +  pthread_join (th, NULL);
> +  fprintf (stderr, "thread_join DONE\n");
> +}
> 
>
Alexandre Oliva June 3, 2015, 1:20 p.m. UTC | #2
On Jun  3, 2015, Torvald Riegel <triegel@redhat.com> wrote:

> It would be good to have the synchronization for TLS, in particular how
> we implement it and which lock protects accesses to which data, be
> documented somewhere.  I didn't see a pointer to such documentation,
> does it exist?

I very much doubt it.

> If not, and given that you seem to be familiar with it, it would be
> good if you could add it.

Surely not as part of this regression fix, but yeah, that would be nice.
I'll talk to my manager about such a large project, but I'm not very
hopeful, considering I'm back in GCC for the foreseeable future.

>> -  /* Make sure that, if a dlopen running in parallel forces the
>> -     variable into static storage, we'll wait until the address in the
>> -     static TLS block is set up, and use that.  If we're undecided
>> -     yet, make sure we make the decision holding the lock as well.  */
>> -  if (__glibc_unlikely (the_map->l_tls_offset
>> -			!= FORCED_DYNAMIC_TLS_OFFSET))
>> +  /* If the TLS block for the map is already assigned to dynamic or to
>> +     static TLS, avoid the lock.  Be careful to use the same value for
>> +     both tests; if we reloaded it, the second test might mistake
>> +     forced dynamic for an offset.  Now, if the decision hasn't been
>> +     made, take the rtld lock, so that an ongoing dlopen gets a chance
>> +     to complete, and then retest; if the decision is still pending,
>> +     force the module to dynamic TLS.  */
>> +  ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset);

> This should document why relaxed MO is sufficient

The comments I added attempted to do so.  Can you please try to indicate
what appears to be missing in them?

As for why we need relaxed MO, the first question is why we need atomics
to begin with.  We don't really need anything special there, we just
need to make sure we perform the newly-added test using the same value
used in the preexisting unguarded test.  I was tempted to load the value
into a var and use the asm("":"+X"(var)) idiom, to make sure the var is
not reloaded, but I figured a relaxed atomic load would express that
more nicely.

Now, if you were to ask me for formal proof that relaxed is enough, I
would have to tell you it probably isn't.  Although the memory used for
l_tls_offset in a link map will necessarily be initialized by a thread
holding the load lock, if that memory had been used for anything else
before, another thread might still access a previous value without an
acquire, and if what it finds there happens to be
FORCED_DYNAMIC_TLS_OFFSET, we'd get unexpected behavior.  That said, we
seem to have got along fine with a non-atomic load, so making it a
relaxed atomic load doesn't make it any worse.

This is just one out of many examples of how the current TLS
implementation makes assumptions about the underlying memory model that
are not documented and that are hardly guaranteed by the language or by
the broad variation of target architectures.  It's like we're driving,
blind, at high speed, on a busy freeway, facing the wrong way.  Yet
somehow it at least appears to work, and I guess most of us are too
scared to touch it in any significant way ;-)

I've already mentioned other fragile aspects in the way we deal with
slotinfo, in how DTV growth is AS-Unsafe, in how dynamic TLS Descriptors
are using volatile where atomics would have been a better choice (they
didn't exist back then :-), and the list of suspicious behavior related
with TLS just keeps growing.  I like your goal of getting it documented
and adjusted to current standards, but given our differences in
terminology and background, I very much doubt any such docs coming from
me will meet your expectations.

Perhaps the best way to go about this project is to make it Q&As: you
ask about the properties of something you're trying to document or clean
up, I (or someone else) answer them, then, after any further
clarifications, you turn that into documentation and code changes.
Would you like to do that?


>> __rtld_lock_lock_recursive (GL(dl_load_lock));

> Why do we need the lock at all?

It's right there in the comments, both before and after the change,
though I tried to make the reason more immediately apparent.  Hint:
concurrent dlopen.

> Are we protecting just access to
> l_tls_offset or to other memory locations as well?

We must wait for dlopen to complete.  It might assign the module to
static TLS if some IE relocation requires it.

>> -      if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET))
>> +      offset = the_map->l_tls_offset;

> l_tls_offset is effectively concurrently accessed data,

> Likewise for the store below.

Yeah, but these are guarded by the lock.

Sure, to be absolutely safe, we want all loads and stores to be atomic
(current code assumes aligned word accesses are relaxed atomic, with
acquire and release implied by the locks), but I'm not inclined to
revisit all that just to fix this regression.  A minimal change is
better for that.


>> +static int running = 1;
>> +  while (running)

> That's a data race with the modification do_end, or isn't it?  Use
> atomics, a semaphore (with trywait), or trylock.

Andreas, any preferences as to how to adjust your (?) testcase to meet
Torvald's standards?

/me takes a mental note to acknowledge the testcase author in the
ChangeLog.

Sorry I didn't do that last night.
Torvald Riegel June 3, 2015, 3:13 p.m. UTC | #3
On Wed, 2015-06-03 at 10:20 -0300, Alexandre Oliva wrote:
> On Jun  3, 2015, Torvald Riegel <triegel@redhat.com> wrote:
> >> -  /* Make sure that, if a dlopen running in parallel forces the
> >> -     variable into static storage, we'll wait until the address in the
> >> -     static TLS block is set up, and use that.  If we're undecided
> >> -     yet, make sure we make the decision holding the lock as well.  */
> >> -  if (__glibc_unlikely (the_map->l_tls_offset
> >> -			!= FORCED_DYNAMIC_TLS_OFFSET))
> >> +  /* If the TLS block for the map is already assigned to dynamic or to
> >> +     static TLS, avoid the lock.  Be careful to use the same value for
> >> +     both tests; if we reloaded it, the second test might mistake
> >> +     forced dynamic for an offset.  Now, if the decision hasn't been
> >> +     made, take the rtld lock, so that an ongoing dlopen gets a chance
> >> +     to complete, and then retest; if the decision is still pending,
> >> +     force the module to dynamic TLS.  */
> >> +  ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset);
> 
> > This should document why relaxed MO is sufficient
> 
> The comments I added attempted to do so.  Can you please try to indicate
> what appears to be missing in them?
> 
> As for why we need relaxed MO, the first question is why we need atomics
> to begin with.  We don't really need anything special there, we just
> need to make sure we perform the newly-added test using the same value
> used in the preexisting unguarded test.

So you think a reload by the compiler would be bad.  This can only be
bad if there is concurrent modification, potentially; either in a signal
handler by the same thread (ie, reentrant), or by another thread.  Thus,
therefore we need the atomic access; using a plain load would be a data
race, and we don't do that in new code or when fixing code.

> I was tempted to load the value
> into a var and use the asm("":"+X"(var)) idiom, to make sure the var is
> not reloaded, but I figured a relaxed atomic load would express that
> more nicely.

Yes.  This is concurrency, and we have atomic accesses for it :)

> Now, if you were to ask me for formal proof that relaxed is enough,

I'm not asking for a formal proof, I'm looking for an explanation that
clearly tells others looking at this code in the future why this is
supposed to work.  Relaxed accesses do work in some cases, but in many
others they aren't sufficient; therefore, we want to document why they
are sufficient in the particular case.

> I
> would have to tell you it probably isn't.  Although the memory used for
> l_tls_offset in a link map will necessarily be initialized by a thread
> holding the load lock, if that memory had been used for anything else
> before, another thread might still access a previous value without an
> acquire, and if what it finds there happens to be
> FORCED_DYNAMIC_TLS_OFFSET, we'd get unexpected behavior.

AFAIU, you seem to speak about memory reuse unrelated to this
specifically this particular load, right?  If that's a concern, let's
open a bug or such for this.  But that sounds like an issue
independently of whether the specific load is an acquire or relaxed
load.  Or do I misunderstand you?

We established before that you want to prevent reload because there are
concurrent stores.  Are these by other threads?

If so, are there any cases of the following pattern:

storing thread;
  A;
  atomic_store_relaxed(&l_tls_offset, ...);

observing thread:
  offset = atomic_load_relaxed(&l_tls_offset);
  B(offset);

where something in B (which uses or has a dependency on offset) relies
on happening after A?

If there are such cases, we need a release store and acquire load,
unless we can get around that for other reasons.  If there are none,
this is a reason why a relaxed MO load can be sufficient.
Ideally, there would be documentation of what the major happens-before
orderings are that the TLS code relies on, and you could just point to
those and say that those don't apply here.  In absence of that, I
suggest just briefly describing why such dependencies on other things
happening before do not exist.  For example, if the memory location
pretty much exists on its own (logically) and there are no dependencies
to other stuff (e.g., a statistics counter), then relaxed is fine.

> That said, we
> seem to have got along fine with a non-atomic load, so making it a
> relaxed atomic load doesn't make it any worse.

It won't -- but if you're fixing it anyway, it won't hurt to do it right
or at least leave a code comment that something may need to be fixed or
reviewed in the future (e.g., "/* XXX Don't know wehther relaxed MO is
sufficient.  */").  You must have been thinking about it to decide you
don't want the reload -- it would be good if could finish that.

> This is just one out of many examples of how the current TLS
> implementation makes assumptions about the underlying memory model that
> are not documented and that are hardly guaranteed by the language or by
> the broad variation of target architectures.  It's like we're driving,
> blind, at high speed, on a busy freeway, facing the wrong way.  Yet
> somehow it at least appears to work, and I guess most of us are too
> scared to touch it in any significant way ;-)
> 
> I've already mentioned other fragile aspects in the way we deal with
> slotinfo, in how DTV growth is AS-Unsafe, in how dynamic TLS Descriptors
> are using volatile where atomics would have been a better choice (they
> didn't exist back then :-), and the list of suspicious behavior related
> with TLS just keeps growing.  I like your goal of getting it documented
> and adjusted to current standards, but given our differences in
> terminology and background, I very much doubt any such docs coming from
> me will meet your expectations.
> 
> Perhaps the best way to go about this project is to make it Q&As: you
> ask about the properties of something you're trying to document or clean
> up, I (or someone else) answer them, then, after any further
> clarifications, you turn that into documentation and code changes.
> Would you like to do that?

I'd need to find the time to do that first :)  I also believe that many
people in the project should be familiar with how we do synchronization.
With that in mind, it would seem efficient if people already familiar in
a certain area within glibc would update the synchronization in this
area.

Much of it is knowing how synchronization is *meant* to work --
implementing it properly is to a large extent not more than following
some rules how to do it and document it.

In a way, we're doing the Q&A already:  :)
I'm trying to find out what you know about the intent behind the TLS
synchronization, and I'm trying to work with you on putting this into
words that are precise and on top of the memory model we want to use.

> 
> >> __rtld_lock_lock_recursive (GL(dl_load_lock));
> 
> > Why do we need the lock at all?
> 
> It's right there in the comments, both before and after the change,
> though I tried to make the reason more immediately apparent.  Hint:
> concurrent dlopen.
> 
> > Are we protecting just access to
> > l_tls_offset or to other memory locations as well?
> 
> We must wait for dlopen to complete.  It might assign the module to
> static TLS if some IE relocation requires it.

(I'm asking because I'm trying to find out whether there are those B
depends on A dependencies I mentioned above, so that we know what to
document...)

Does dlopen just have to decide about this value (and is this value
pretty much "free-standing"), or does it have to do other initialization
or other effects logically related to this value (ie, is there an A that
matters for B)?  If not, we can say that in our documentation.

If it's really just about waiting for dlopen because dlopen absolutely
has make the decision, than the lock seems right (and we have our
documentation).

If only one of dlopen or we here have to decide about the value, then
using a CAS both here and dlopen could work as well.

> >> -      if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET))
> >> +      offset = the_map->l_tls_offset;
> 
> > l_tls_offset is effectively concurrently accessed data,
> 
> > Likewise for the store below.
> 
> Yeah, but these are guarded by the lock.

But there are concurrent (atomic) loads to it, right?  Therefore, this
would be a data race, and we don't do those in new code or in code that
we change.

> Sure, to be absolutely safe, we want all loads and stores to be atomic
> (current code assumes aligned word accesses are relaxed atomic, with
> acquire and release implied by the locks), but I'm not inclined to
> revisit all that just to fix this regression.  A minimal change is
> better for that.

I disagree.  You added an atomic load on the consumer side (rightly
so!), so you should not ignore the producer side either.  This is in the
same function, and you touch most of the lines around it, and it's
confusing if you make a partial change.  Doing The Right Thing here is
not a lot of overhead.  Splitting this into a separate patch seems to be
more work than just making the change in your patch.

> 
> >> +static int running = 1;
> >> +  while (running)
> 
> > That's a data race with the modification do_end, or isn't it?  Use
> > atomics, a semaphore (with trywait), or trylock.
> 
> Andreas, any preferences as to how to adjust your (?) testcase to meet
> Torvald's standards?

Let me point out that we do have consensus in the project that new code
must be free of data races.
Andreas Schwab June 4, 2015, 4:36 p.m. UTC | #4
Alexandre Oliva <aoliva@redhat.com> writes:

>>> +static int running = 1;
>>> +  while (running)
>
>> That's a data race with the modification do_end, or isn't it?

There is no way this can go wrong.

Andreas.
Carlos O'Donell June 4, 2015, 4:41 p.m. UTC | #5
On 06/04/2015 12:36 PM, Andreas Schwab wrote:
> Alexandre Oliva <aoliva@redhat.com> writes:
> 
>>>> +static int running = 1;
>>>> +  while (running)
>>
>>> That's a data race with the modification do_end, or isn't it?
> 
> There is no way this can go wrong.

That doesn't matter, it's still conceptually wrong and continues
to promote the idea that concurrency doesn't matter because the
hardware will just fix it (tm).

The whole point of writing data-race-free code is to get people
to think about concurrency and using the right constructs to
write concurrent code. That way when we have better tooling to
detect serious data races we don't have to fix all of these issues.

Cheers,
Carlos.
Andreas Schwab June 4, 2015, 4:47 p.m. UTC | #6
"Carlos O'Donell" <carlos@redhat.com> writes:

> On 06/04/2015 12:36 PM, Andreas Schwab wrote:
>> Alexandre Oliva <aoliva@redhat.com> writes:
>> 
>>>>> +static int running = 1;
>>>>> +  while (running)
>>>
>>>> That's a data race with the modification do_end, or isn't it?
>> 
>> There is no way this can go wrong.
>
> That doesn't matter,

It is completely irrelevant for the bug.  Just remove it.

Andreas.
Carlos O'Donell June 4, 2015, 4:49 p.m. UTC | #7
> diff --git a/nptl/tst-join7.c b/nptl/tst-join7.c
> new file mode 100644
> index 0000000..bf6fc76
> --- /dev/null
> +++ b/nptl/tst-join7.c
> @@ -0,0 +1,12 @@
> +#include <dlfcn.h>

I have commented downstream on the patch, but I just wanted to
note that all test cases need FSF header, and a single line
explaining what the test does.

Cheers,
Carlos.
Torvald Riegel June 4, 2015, 8:37 p.m. UTC | #8
On Thu, 2015-06-04 at 18:36 +0200, Andreas Schwab wrote:
> Alexandre Oliva <aoliva@redhat.com> writes:
> 
> >>> +static int running = 1;
> >>> +  while (running)
> >
> >> That's a data race with the modification do_end, or isn't it?
> 
> There is no way this can go wrong.

Of course there is.

"running" is static, and it's easy for the compiler to see that it
doesn't escape in this program (ie, it's not visible outside of this
TU).  Thus, the printf in the while loop will never modify it.

You promised the compiler a data-race-free program.  Thus, you promised
that "running" won't ever change during the execution of "test_run" (ie,
no other thread and no other TU will ever do it).  Therefore, the
compiler can hoist the load of "running" outside of the loop:

register = running;
while (register)
  printf("...");

And your program won't terminate as a result.  The compiler can even do
the following transformation, figuring out that the register content
doesn't change:

if (running)
  while (1)
    printf("...");


As another data point, think about why we have "atomic_forced_read" in
include/atomic.h.  All it does is to actually do read a variable.  Why
would we need this if code like in your test can't go wrong?
diff mbox

Patch

diff --git a/NEWS b/NEWS
index ed02de0..eac100c 100644
--- a/NEWS
+++ b/NEWS
@@ -19,7 +19,7 @@  Version 2.22
   18047, 18049, 18068, 18080, 18093, 18100, 18104, 18110, 18111, 18116,
   18125, 18128, 18138, 18185, 18196, 18197, 18206, 18210, 18211, 18217,
   18220, 18221, 18234, 18244, 18247, 18287, 18319, 18333, 18346, 18397,
-  18409, 18410, 18412, 18418, 18422, 18434, 18444, 18469.
+  18409, 18410, 18412, 18418, 18422, 18434, 18444, 18457, 18469.
 
 * Cache information can be queried via sysconf() function on s390 e.g. with
   _SC_LEVEL1_ICACHE_SIZE as argument.
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 20c7e33..726f1ea 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -755,30 +755,44 @@  tls_get_addr_tail (GET_ADDR_ARGS, dtv_t *dtv, struct link_map *the_map)
       the_map = listp->slotinfo[idx].map;
     }
 
-  /* Make sure that, if a dlopen running in parallel forces the
-     variable into static storage, we'll wait until the address in the
-     static TLS block is set up, and use that.  If we're undecided
-     yet, make sure we make the decision holding the lock as well.  */
-  if (__glibc_unlikely (the_map->l_tls_offset
-			!= FORCED_DYNAMIC_TLS_OFFSET))
+  /* If the TLS block for the map is already assigned to dynamic or to
+     static TLS, avoid the lock.  Be careful to use the same value for
+     both tests; if we reloaded it, the second test might mistake
+     forced dynamic for an offset.  Now, if the decision hasn't been
+     made, take the rtld lock, so that an ongoing dlopen gets a chance
+     to complete, and then retest; if the decision is still pending,
+     force the module to dynamic TLS.  */
+  ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset);
+  if (__glibc_unlikely (offset != FORCED_DYNAMIC_TLS_OFFSET))
     {
+      if (__glibc_unlikely (offset != NO_TLS_OFFSET))
+	goto static_tls;
       __rtld_lock_lock_recursive (GL(dl_load_lock));
-      if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET))
+      offset = the_map->l_tls_offset;
+      if (__glibc_likely (offset == NO_TLS_OFFSET))
 	{
 	  the_map->l_tls_offset = FORCED_DYNAMIC_TLS_OFFSET;
 	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
 	}
-      else if (__glibc_likely (the_map->l_tls_offset
-			       != FORCED_DYNAMIC_TLS_OFFSET))
+      else if (__glibc_likely (offset != FORCED_DYNAMIC_TLS_OFFSET))
 	{
+	  /* The decision is made, and it is final.  We use the value
+	     we've already loaded, but we could even load the offset
+	     after releasing the lock, since it won't change.  Should
+	     the module be released while another thread references
+	     one of its TLS variables, that's undefined behavior.  */
+	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
+
+	static_tls:
+	  ;
+
 #if TLS_TCB_AT_TP
-	  void *p = (char *) THREAD_SELF - the_map->l_tls_offset;
+	  void *p = (char *) THREAD_SELF - offset;
 #elif TLS_DTV_AT_TP
-	  void *p = (char *) THREAD_SELF + the_map->l_tls_offset + TLS_PRE_TCB_SIZE;
+	  void *p = (char *) THREAD_SELF + offset + TLS_PRE_TCB_SIZE;
 #else
 # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
 #endif
-	  __rtld_lock_unlock_recursive (GL(dl_load_lock));
 
 	  dtv[GET_ADDR_MODULE].pointer.is_static = true;
 	  dtv[GET_ADDR_MODULE].pointer.val = p;
diff --git a/nptl/Makefile b/nptl/Makefile
index 3dd2944..4ffd13c 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -242,7 +242,7 @@  tests = tst-typesizes \
 	tst-basic7 \
 	tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \
 	tst-raise1 \
-	tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 \
+	tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 tst-join7 \
 	tst-detach1 \
 	tst-eintr1 tst-eintr2 tst-eintr3 tst-eintr4 tst-eintr5 \
 	tst-tsd1 tst-tsd2 tst-tsd3 tst-tsd4 tst-tsd5 tst-tsd6 \
@@ -320,7 +320,8 @@  endif
 modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \
 		tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \
 		tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \
-		tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod
+		tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \
+		tst-join7mod
 extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o
 test-extras += $(modules-names) tst-cleanup4aux
 test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names)))
@@ -525,6 +526,11 @@  $(objpfx)tst-tls6.out: tst-tls6.sh $(objpfx)tst-tls5 \
 	$(evaluate-test)
 endif
 
+$(objpfx)tst-join7: $(libdl) $(shared-thread-library)
+$(objpfx)tst-join7.out: $(objpfx)tst-join7mod.so
+$(objpfx)tst-join7mod.so: $(shared-thread-library)
+LDFLAGS-tst-join7mod.so = -Wl,-soname,tst-join7mod.so
+
 $(objpfx)tst-dlsym1: $(libdl) $(shared-thread-library)
 
 $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so
diff --git a/nptl/tst-join7.c b/nptl/tst-join7.c
new file mode 100644
index 0000000..bf6fc76
--- /dev/null
+++ b/nptl/tst-join7.c
@@ -0,0 +1,12 @@ 
+#include <dlfcn.h>
+
+int
+do_test (void)
+{
+  void *f = dlopen ("tst-join7mod.so", RTLD_NOW | RTLD_GLOBAL);
+  if (f) dlclose (f); else return 1;
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/nptl/tst-join7mod.c b/nptl/tst-join7mod.c
new file mode 100644
index 0000000..a8c7bc0
--- /dev/null
+++ b/nptl/tst-join7mod.c
@@ -0,0 +1,29 @@ 
+#include <stdio.h>
+#include <pthread.h>
+
+static pthread_t th;
+static int running = 1;
+
+static void *
+test_run (void *p)
+{
+  while (running)
+    fprintf (stderr, "XXX test_run\n");
+  fprintf (stderr, "XXX test_run FINISHED\n");
+  return NULL;
+}
+
+static void __attribute__ ((constructor))
+do_init (void)
+{
+  pthread_create (&th, NULL, test_run, NULL);
+}
+
+static void __attribute__ ((destructor))
+do_end (void)
+{
+  running = 0;
+  fprintf (stderr, "thread_join...\n");
+  pthread_join (th, NULL);
+  fprintf (stderr, "thread_join DONE\n");
+}