
eliminate mutex in fast path of __register_frame

Message ID 17b916af-e010-1e86-0e44-67799439e466@in.tum.de
State New
Series eliminate mutex in fast path of __register_frame

Commit Message

Thomas Neumann March 1, 2022, 9:58 p.m. UTC
The __register_frame/__deregister_frame functions are used to register
unwinding frames from JITed code in a sorted list. That list itself
is protected by object_mutex, which leads to terrible performance
in multi-threaded code and is somewhat expensive even if single-threaded.
There was already a fast-path that avoided taking the mutex if no
frame was registered at all.

This commit eliminates both the mutex and the sorted list from
the atomic fast path, and replaces it with a btree that uses
optimistic lock coupling during lookup. This allows for fully parallel
unwinding and is essential to scale exception handling to large
core counts.

libgcc/ChangeLog:

         * unwind-dw2-fde.c (release_registered_frames): Cleanup at shutdown.
         (__register_frame_info_table_bases): Use btree in atomic fast path.
         (__deregister_frame_info_bases): Likewise.
         (_Unwind_Find_FDE): Likewise.
         (base_from_object): Make parameter const.
         (get_pc_range_from_fdes): Compute PC range for lookup.
         (get_pc_range): Likewise.
         * unwind-dw2-fde.h (last_fde): Make parameter const.
         * unwind-dw2-btree.h: New file.
---
  libgcc/unwind-dw2-btree.h | 870 ++++++++++++++++++++++++++++++++++++++
  libgcc/unwind-dw2-fde.c   | 187 ++++++--
  libgcc/unwind-dw2-fde.h   |   2 +-
  3 files changed, 1023 insertions(+), 36 deletions(-)
  create mode 100644 libgcc/unwind-dw2-btree.h

Comments

Florian Weimer March 2, 2022, 2:08 p.m. UTC | #1
* Thomas Neumann:

> +#ifndef HIDE_EXPORTS
> +#pragma GCC visibility push(default)
> +#endif

All defined functions are static, right?  Then this isn't necessary.

> +// Common logic for version locks
> +struct version_lock
> +{
> +  // The lock itself
> +  uintptr_t version_lock;
> +};

version_lock must not overflow, right?  This means we need a wider
counter on 32-bit, too.  glibc contains a 62-bit counter that it uses
for its own data structure.

> +// Lock the node exclusive, blocking as needed
> +static void
> +version_lock_lock_exclusive (struct version_lock *vl)
> +{
> +  // We should virtually never get contention here, as frame
> +  // changes are rare. Thus we use a simple spinlock

Isn't this problematic if there are multiple compiler threads that race
to register their output with the unwinder?

If updates are rare, is per-node locking really needed?

What can we do to make this async-signal-safe, so that a call into the
unwinder from a signal handler does not spin endlessly if the receiving
thread is currently registering or deregistering a frame?  Is simply
calling sigprocmask around the register and unregister operations
enough?  (These operations don't need to be async-signal-safe, only
lookup should be.)  This can be a future change if we feel confident
that it's possible to rectify this later if needed.

> +// Validate a previously acquire lock
> +static inline bool
> +version_lock_validate (const struct version_lock *vl, uintptr_t lock)
> +{
> +  // Check that the node is still in the same state
> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
> +  return (state == lock);
> +}

I think an acquire fence is missing before the __atomic_load_n.  We
learned this the hard way in the glibc implementation.  The reference
that Szabolcs found is:

     Hans Boehm, Can Seqlocks Get Along with Programming Language
     Memory Models?, Section 4.

> +static void
> +btree_release_tree_recursively (struct btree *t, struct btree_node *n);
> +
> +// Destroy a tree and release all nodes. Not used currently, but could be called
> +// at shutdown to destroy the frame lookup
> +static void
> +btree_destroy (struct btree *t)
> +{

The comment seems to be incorrect, it is used?  Otherwise there should
be a compiler warning.

> +// Allocate a node. This node will be returned in locked exclusive state
> +static struct btree_node *
> +btree_allocate_node (struct btree *t, bool inner)
> +{

> +      // No free page available, allocate a new one
> +      struct btree_node *new_node
> +	= (struct btree_node *) (malloc (sizeof (struct btree_node)));
> +      version_lock_initialize_locked_exclusive (
> +	&(new_node->version_lock)); // initialize the node in locked state
> +      new_node->entry_count = 0;
> +      new_node->type = inner ? btree_node_inner : btree_node_leaf;
> +      return new_node;
> +    }
> +}

This needs some form of error checking for malloc.  But I see the
existing code does not have that, either. 8-(

> +// Find the corresponding entry the given address
> +static struct object *
> +btree_lookup (const struct btree *t, uintptr_t target_addr)
> +{

> +      if (type == btree_node_inner)
> +	{
> +	  // We cannot call find_inner_slot here because we can only trust our
> +	  // validated entries
> +	  unsigned slot = 0;
> +	  while (((slot + 1) < entry_count)
> +		 && (iter->content.children[slot].separator < target_addr))
> +	    ++slot;
> +	  struct btree_node *child = iter->content.children[slot].child;
> +	  if (!btree_node_validate (iter, lock))
> +	    goto restart;

I don't understand the comment about not using find_inner_slot.

I think if you use separate free lists for inner nodes and leaf nodes,
you never need to structurally modify the data.  Then you can avoid
validating on every level (with proper relaxed MO everywhere; right now
GCC might turn some of the array copies into memcpy, which may use
weaker-than-relaxed MO on some targets and actually produce
out-of-thin-air values).

> diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
> index 8ee55be5675..56be596713b 100644
> --- a/libgcc/unwind-dw2-fde.c
> +++ b/libgcc/unwind-dw2-fde.c
> @@ -42,15 +42,34 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>   #endif
>   #endif
>   
> +#ifdef ATOMIC_FDE_FAST_PATH
> +#include "unwind-dw2-btree.h"
> +
> +static struct btree registered_frames;
> +
> +static void
> +release_registered_frames (void) __attribute__ ((destructor (110)));
> +static void
> +release_registered_frames (void)
> +{
> +  /* Release the b-tree and all frames. Frame releases that happen later are
> +   * silently ignored */
> +  btree_destroy (&registered_frames);
> +}

Is the comment correct?  Won't a subsequent frame release necessarily
involve a use-after-free bug?  btree_destroy is only safe to call from a
quiesced process.  Deallocating these data structures is mainly relevant
for mtrace and valgrind --show-reachable=yes.  We could teach them to
call a new freeres hook in libgcc_s.

> +#ifdef ATOMIC_FDE_FAST_PATH
> +/* Get the PC range from FDEs */
> +static void
> +get_pc_range_from_fdes (const struct object *ob, const fde *this_fde,
> +			uintptr_t *range)
> +{

Would it be possible to share the bulk of the implementation with
classify_object_over_fdes?

>   /* A linear search through a set of FDEs for the given PC.  This is
>      used when there was insufficient memory to allocate and sort an
>      array.  */
> @@ -1033,17 +1154,12 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>     const fde *f = NULL;
>   
>   #ifdef ATOMIC_FDE_FAST_PATH
> -  /* For targets where unwind info is usually not registered through these
> -     APIs anymore, avoid taking a global lock.
> -     Use relaxed MO here, it is up to the app to ensure that the library
> -     loading/initialization happens-before using that library in other
> -     threads (in particular unwinding with that library's functions
> -     appearing in the backtraces).  Calling that library's functions
> -     without waiting for the library to initialize would be racy.  */
> -  if (__builtin_expect (!__atomic_load_n (&any_objects_registered,
> -					  __ATOMIC_RELAXED), 1))
> +  ob = btree_lookup (&registered_frames, (uintptr_t) pc);
> +  if (!ob)
>       return NULL;
> -#endif
> +
> +  f = search_object (ob, pc);
> +#else

I think we should keep any_objects_registered, to avoid the SEQ_CST
fences in btree_lookup.  There might be some interest from the
maintainers of weakly ordered architectures to move away from SEQ_CST to
something else, but I assume that could be a follow-up change.

I don't know much about B-trees, so I can't really tell if the
implementation is correct.  I don't see any obvious problems, though.

I'm not a GCC reviewer.  If the project wants to adopt the patch, we'll
need to fix a few minor style issues.  But let's wait for the GCC
developers first.

Thanks,
Florian
Thomas Neumann March 2, 2022, 3:02 p.m. UTC | #2
>> +// Common logic for version locks
>> +struct version_lock
>> +{
>> +  // The lock itself
>> +  uintptr_t version_lock;
>> +};
> 
> version_lock must not overflow, right?  This means we need a wider
> counter on 32-bit, too.  glibc contains a 62-bit counter that it uses
> for its own data structure.

an overflow is not a problem per se, it is only problematic if we hit 
exactly the same value again between lock_optimistic and validate. Note 
that these ranges are usually a handful of assembler instructions, and 
we would have to see 4 billion frame registrations in that short time 
span. I don't think that is a problem in practice. But I can switch to 
64bit, of course, if you think the risk is too high.

> 
>> +// Lock the node exclusive, blocking as needed
>> +static void
>> +version_lock_lock_exclusive (struct version_lock *vl)
>> +{
>> +  // We should virtually never get contention here, as frame
>> +  // changes are rare. Thus we use a simple spinlock
> 
> Isn't this problematic if there are multiple compiler threads that race
> to register their output with the unwinder?
> 
> If updates are rare, is per-node locking really needed?
> 
> What can we do to make this async-signal-safe, so that a call into the
> unwinder from a signal handler does not spin endlessly if the receiving
> thread is currently registering or deregistering a frame?  Is simply
> calling sigprocmask around the register and unregister operations
> enough?  (These operations don't need to be async-signal-safe, only
> lookup should be.)  This can be a future change if we feel confident
> that it's possible to rectify this later if needed.

we need the locking bit because a concurrent unwinder must recognize 
that the node is currently being modified. But we could eliminate the 
spinlock aspect by using a global mutex, which would guarantee us that 
nothing is locked exclusively and thus no spinning is required. That 
would also allow us to fix the async-signal-safety at the same time if 
needed by blocking signals globally during updates.

Note that the current code is not async-signal-safe either, it simply 
grabs a mutex. If a signal happens while calling __register_frame, and 
that handler tries to unwind, the current code will deadlock, too.
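
A minimal sketch of that direction (purely illustrative, not part of the
patch): a single writer mutex plus sigprocmask around each update, so
writers never contend on node spinlocks and a signal handler on the
updating thread cannot try to unwind mid-update. The helper name and the
callback shape are made up for the example.

#include <signal.h>   /* sigprocmask; gthr.h provides __gthread_mutex_t */

static __gthread_mutex_t update_mutex = __GTHREAD_MUTEX_INIT;

/* Run one frame-table update (e.g. a btree_insert or btree_remove) with
   all writers serialized and signals blocked on this thread.  */
static void
update_registered_frames (void (*update) (void *), void *arg)
{
  sigset_t all, old;
  sigfillset (&all);
  sigprocmask (SIG_BLOCK, &all, &old);
  __gthread_mutex_lock (&update_mutex);
  update (arg);
  __gthread_mutex_unlock (&update_mutex);
  sigprocmask (SIG_SETMASK, &old, NULL);
}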

> 
>> +// Validate a previously acquire lock
>> +static inline bool
>> +version_lock_validate (const struct version_lock *vl, uintptr_t lock)
>> +{
>> +  // Check that the node is still in the same state
>> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
>> +  return (state == lock);
>> +}
> 
> I think an acquire fence is missing before the __atomic_load_n.  We
> learned this the hard way in the glibc implementation.  The reference
> that Szabolcs found is:
> 
>       Hans Boehm, Can Seqlocks Get Along with Programming Language
>       Memory Models?, Section 4.

thanks for the pointer, I will look at this more carefully. If I read 
the text correctly, we need a __atomic_thread_fence(__ATOMIC_ACQUIRE) 
before the load, but I will double check that.
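
For illustration, with that reading the validate function from the patch
would gain one fence; a sketch of the possible fix (not the final code):

// Validate a previously acquired optimistic lock
static inline bool
version_lock_validate (const struct version_lock *vl, uintptr_t lock)
{
  // Keep the speculative reads from being reordered past the version
  // re-check (Boehm, "Can Seqlocks Get Along with Programming Language
  // Memory Models?", Section 4)
  __atomic_thread_fence (__ATOMIC_ACQUIRE);

  // Check that the node is still in the same state
  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
  return (state == lock);
}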


>> +static void
>> +btree_release_tree_recursively (struct btree *t, struct btree_node *n);
>> +
>> +// Destroy a tree and release all nodes. Not used currently, but could be called
>> +// at shutdown to destroy the frame lookup
>> +static void
>> +btree_destroy (struct btree *t)
>> +{
> 
> The comment seems to be incorrect, it is used?  Otherwise there should
> be a compiler warning.
> 

sorry for that, I initially wanted to keep the tree alive forever, but 
then decided to destroy it at shutdown. I will update the comment.

>> +// Allocate a node. This node will be returned in locked exclusive state
>> +static struct btree_node *
>> +btree_allocate_node (struct btree *t, bool inner)
>> +{
> 
>> +      // No free page available, allocate a new one
>> +      struct btree_node *new_node
>> +	= (struct btree_node *) (malloc (sizeof (struct btree_node)));
>> +      version_lock_initialize_locked_exclusive (
>> +	&(new_node->version_lock)); // initialize the node in locked state
>> +      new_node->entry_count = 0;
>> +      new_node->type = inner ? btree_node_inner : btree_node_leaf;
>> +      return new_node;
>> +    }
>> +}
> 
> This needs some form of error checking for malloc.  But I see the
> existing code does not have that, either. 8-(

and I do not see how we can really handle a malloc failure here. What 
should we do except die? And, as you pointed out, that problem is 
fundamental to the surrounding code, too: it allocates a struct object 
via malloc and will crash if that fails.


>> +// Find the corresponding entry the given address
>> +static struct object *
>> +btree_lookup (const struct btree *t, uintptr_t target_addr)
>> +{
> 
>> +      if (type == btree_node_inner)
>> +	{
>> +	  // We cannot call find_inner_slot here because we can only trust our
>> +	  // validated entries
>> +	  unsigned slot = 0;
>> +	  while (((slot + 1) < entry_count)
>> +		 && (iter->content.children[slot].separator < target_addr))
>> +	    ++slot;
>> +	  struct btree_node *child = iter->content.children[slot].child;
>> +	  if (!btree_node_validate (iter, lock))
>> +	    goto restart;
> 
> I don't understand the comment about not using find_inner_slot.
> 
> I think if you use separate free lists for inner nodes and leaf nodes,
> you never need to structurally modify the data.  Then you can avoid
> validating on every level (with proper relaxed MO everywhere; right now
> GCC might turn some of the array copies into memcpy, which may use
> weaker-than-relaxed MO on some targets and actually produce
> out-of-thin-air values).

the problem is that, at least conceptually, we can read arbitrary 
garbage while a concurrent change is ongoing. In particular the
n->element_case within find_inner_slot might return anything, including 
torn writes etc. The btree_lookup defends against that by using only 
validated values as loop limits, but the regular code does not. And 
instead of uglifying the regular lookup_*_slot functions I preferred to 
duplicate that code with additional validate checks, which are only 
3 lines anyway.


>> +
>> +static void
>> +release_registered_frames (void) __attribute__ ((destructor (110)));
>> +static void
>> +release_registered_frames (void)
>> +{
>> +  /* Release the b-tree and all frames. Frame releases that happen later are
>> +   * silently ignored */
>> +  btree_destroy (&registered_frames);
>> +}
> 
> Is the comment correct?  Won't a subsequent frame release necessarily
> involve a use-after-free bug?  btree_destroy is only safe to call from a
> quiesced process.  Deallocating these data structures is mainly relevant
> for mtrace and valgrind --show-reachable=yes.  We could teach them to
> call a new freeres hook in libgcc_s.

a subsequent frame release is safe, it will perform a btree lookup (on 
an empty btree), find nothing, and stop. Even frame registrations would 
be safe, they would just lead to leaked memory because the new nodes 
would not be released.
The only dangerous thing would be to require unwinding through frames 
that have been released by release_registered_frames, as we would not 
find the exception handler. (And unwinding in concurrent threads would 
be racy, but I think that must not occur at shutdown anyway.)
But if you want, I can remove that destructor call; I just thought it 
would be nicer to release all memory.

> 
>> +#ifdef ATOMIC_FDE_FAST_PATH
>> +/* Get the PC range from FDEs */
>> +static void
>> +get_pc_range_from_fdes (const struct object *ob, const fde *this_fde,
>> +			uintptr_t *range)
>> +{
> 
> Would it be possible to share the bulk of the implementation with
> classify_object_over_fdes?

unfortunately classify_object_over_fdes modifies the struct object, and 
we must not do that. I thought about reusing the code, but it is not 
trivial to do. I can try harder if you want, but it will make the other 
code path uglier.

> 
>>    /* A linear search through a set of FDEs for the given PC.  This is
>>       used when there was insufficient memory to allocate and sort an
>>       array.  */
>> @@ -1033,17 +1154,12 @@ _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
>>      const fde *f = NULL;
>>    
>>    #ifdef ATOMIC_FDE_FAST_PATH
>> -  /* For targets where unwind info is usually not registered through these
>> -     APIs anymore, avoid taking a global lock.
>> -     Use relaxed MO here, it is up to the app to ensure that the library
>> -     loading/initialization happens-before using that library in other
>> -     threads (in particular unwinding with that library's functions
>> -     appearing in the backtraces).  Calling that library's functions
>> -     without waiting for the library to initialize would be racy.  */
>> -  if (__builtin_expect (!__atomic_load_n (&any_objects_registered,
>> -					  __ATOMIC_RELAXED), 1))
>> +  ob = btree_lookup (&registered_frames, (uintptr_t) pc);
>> +  if (!ob)
>>        return NULL;
>> -#endif
>> +
>> +  f = search_object (ob, pc);
>> +#else
> 
> I think we should keep any_objects_registered, to avoid the SEQ_CST
> fences in btree_lookup.  There might be some interest from the
> maintainers of weakly ordered architectures to move away from SEQ_CST to
> something else, but I assume that could be a follow-up change.

We can get the same effect by putting a

if (__builtin_expect(!__atomic_load_n (&(t->root),__ATOMIC_RELAXED), 1))
    return NULL;

at the beginning of btree_lookup. I can do that if you want. Note, 
though, that this requires some kind of application-level 
synchronization between threads using unwinding and threads calling 
__register_frame. Just as in the existing code, the comment mentions that.
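
Spelled out, that check sits at the very top of btree_lookup before any
version handling; a sketch (the rest of the function is unchanged from
the patch):

  // Fast path: no frames were ever registered through this interface.
  // Relaxed MO suffices here; as with the old any_objects_registered
  // flag, the application must ensure that frame registration
  // happens-before any unwinding that relies on it.
  if (__builtin_expect (!__atomic_load_n (&(t->root), __ATOMIC_RELAXED), 1))
    return NULL;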


Best

Thomas
Florian Weimer March 3, 2022, 9:35 a.m. UTC | #3
* Thomas Neumann:

>>> +// Common logic for version locks
>>> +struct version_lock
>>> +{
>>> +  // The lock itself
>>> +  uintptr_t version_lock;
>>> +};
>> version_lock must not overflow, right?  This means we need a wider
>> counter on 32-bit, too.  glibc contains a 62-bit counter that it uses
>> for its own data structure.
>
> an overflow is not a problem per se, it is only problematic if we hit
> exactly the same value again between lock_optimistic and
> validate. Note that these ranges are usually a handful of assembler
> instructions, and we would have to see 4 billion frame registrations
> in that short time span. I don't think that is a problem in
> practice. But I can switch to 64bit, of course, if you think the risk
> is too high.

This is more of a GCC policy question, which I cannot answer.

At least it needs a comment explaining the decision to ignore overflows.

> But we could eliminate the spinlock aspect by using a global mutex,
> which would guarantee us that nothing is locked exclusively and thus
> no spinning is required. That would also allow us to fix the
> async-signal-safety at the same time if needed by blocking signals
> globally during updates.

Again, I don't know if people consider the spinning a problem.  In
glibc, we would use that mutex.

> Note that the current code is not async-signal-safe either, it simply
> grabs a mutex. If a signal happens while calling __register_frame, and 
> that handler tries to unwind, the current code will deadlock, too.

Yes, understood.

>>> +// Validate a previously acquire lock
>>> +static inline bool
>>> +version_lock_validate (const struct version_lock *vl, uintptr_t lock)
>>> +{
>>> +  // Check that the node is still in the same state
>>> +  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
>>> +  return (state == lock);
>>> +}
>> I think an acquire fence is missing before the __atomic_load_n.  We
>> learned this the hard way in the glibc implementation.  The reference
>> that Szabolcs found is:
>>       Hans Boehm, Can Seqlocks Get Along with Programming Language
>>       Memory Models?, Section 4.
>
> thanks for the pointer, I will look at this more carefully. If I
> read the text correctly, we need a
> __atomic_thread_fence(__ATOMIC_ACQUIRE) before the load, but I will
> double check that.

The equivalent glibc function looks like this:

/* Return true if the read was successful, given the start
   version.  */
static inline bool
_dlfo_read_success (uint64_t start_version)
{
  /* See Hans Boehm, Can Seqlocks Get Along with Programming Language
     Memory Models?, Section 4.  This is necessary so that loads in
     the TM region are not ordered past the version check below.  */
  atomic_thread_fence_acquire ();

  /* Synchronizes with the fences in _dlfo_mappings_begin_update,
     _dlfo_mappings_end_update.  It is important that all stores from
     the last update have become visible, and stores from the next
     update to this version are not before the version number is
     updated.

     Unlike with seqlocks, there is no check for odd versions here
     because we have read the unmodified copy (confirmed to be
     unmodified by the unchanged version).  */
  return _dlfo_read_start_version () == start_version;
}

>>> +// Allocate a node. This node will be returned in locked exclusive state
>>> +static struct btree_node *
>>> +btree_allocate_node (struct btree *t, bool inner)
>>> +{
>> 
>>> +      // No free page available, allocate a new one
>>> +      struct btree_node *new_node
>>> +	= (struct btree_node *) (malloc (sizeof (struct btree_node)));
>>> +      version_lock_initialize_locked_exclusive (
>>> +	&(new_node->version_lock)); // initialize the node in locked state
>>> +      new_node->entry_count = 0;
>>> +      new_node->type = inner ? btree_node_inner : btree_node_leaf;
>>> +      return new_node;
>>> +    }
>>> +}
>> This needs some form of error checking for malloc.  But I see the
>> existing code does not have that, either. 8-(
>
> and I do not see how we can really handle a malloc failure here. What
> should we do except die?

We may have to add a new interface.  In some other cases, I've seen
errno being used for error reporting, but that requires changes in
applications to recognize the error.  It's probably better to crash here
than to fail mysteriously later.
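
If crashing is the chosen behavior, the minimal hardening in
btree_allocate_node would look roughly like this (a sketch, not part of
the posted patch):

      // No free page available, allocate a new one
      struct btree_node *new_node
	= (struct btree_node *) malloc (sizeof (struct btree_node));
      // Fail loudly on allocation failure instead of crashing later on a
      // null-pointer dereference
      if (!new_node)
	abort ();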

Out of curiosity, how many times do you call the registration functions
from your application?  Would a batch registration interface help?  It
could address error reporting as well.

>>> +// Find the corresponding entry the given address
>>> +static struct object *
>>> +btree_lookup (const struct btree *t, uintptr_t target_addr)
>>> +{
>> 
>>> +      if (type == btree_node_inner)
>>> +	{
>>> +	  // We cannot call find_inner_slot here because we can only trust our
>>> +	  // validated entries
>>> +	  unsigned slot = 0;
>>> +	  while (((slot + 1) < entry_count)
>>> +		 && (iter->content.children[slot].separator < target_addr))
>>> +	    ++slot;
>>> +	  struct btree_node *child = iter->content.children[slot].child;
>>> +	  if (!btree_node_validate (iter, lock))
>>> +	    goto restart;
>> I don't understand the comment about not using find_inner_slot.
>> I think if you use separate free lists for inner nodes and leaf
>> nodes,
>> you never need to structurally modify the data.  Then you can avoid
>> validating on every level (with proper relaxed MO everywhere; right now
>> GCC might turn some of the array copies into memcpy, which may use
>> weaker-than-relaxed MO on some targets and actually produce
>> out-of-thin-air values).
>
> the problem is that, at least conceptually, we can read arbitrary
> garbage while a concurrent change is ongoing. In particular the
> n->element_case within find_inner_slot might return anything,
> including torn writes etc. The btree_lookup defends against that by
> using only validated values as loop limits, but the regular code does
> not. And instead of uglyfying the regular lookup_*_slot functions I
> preferred to duplicate that code with additional validate checks
> instead, that are only 3 lines anyway.

Technically the data races are still invalid, you cannot retroactively
undo them using the version check.
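
One way to make those speculative reads well-defined, following the
"relaxed MO everywhere" remark above, is to load every untrusted field
atomically before validating; a hypothetical reshaping of the inner-node
step in btree_lookup:

  // Every untrusted field is read through a relaxed atomic load, so the
  // racy reads are at least well-defined until btree_node_validate
  // confirms them
  unsigned slot = 0;
  while (((slot + 1) < entry_count)
	 && (__atomic_load_n (&iter->content.children[slot].separator,
			      __ATOMIC_RELAXED)
	     < target_addr))
    ++slot;
  struct btree_node *child
    = __atomic_load_n (&iter->content.children[slot].child,
		       __ATOMIC_RELAXED);
  if (!btree_node_validate (iter, lock))
    goto restart;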

>>> +static void
>>> +release_registered_frames (void) __attribute__ ((destructor (110)));
>>> +static void
>>> +release_registered_frames (void)
>>> +{
>>> +  /* Release the b-tree and all frames. Frame releases that happen later are
>>> +   * silently ignored */
>>> +  btree_destroy (&registered_frames);
>>> +}
>> Is the comment correct?  Won't a subsequent frame release
>> necessarily
>> involve a use-after-free bug?  btree_destroy is only safe to call from a
>> quiesced process.  Deallocating these data structures is mainly relevant
>> for mtrace and valgrind --show-reachable=yes.  We could teach them to
>> call a new freeres hook in libgcc_s.
>
> a subsequent frame release is safe, it will perform a btree lookup (on
> an empty btree), find nothing, and stop. Even frame registrations
> would be safe, they would just lead to leaked memory because the new
> nodes would not be released.

Okay.

>>> +#ifdef ATOMIC_FDE_FAST_PATH
>>> +/* Get the PC range from FDEs */
>>> +static void
>>> +get_pc_range_from_fdes (const struct object *ob, const fde *this_fde,
>>> +			uintptr_t *range)
>>> +{
>> Would it be possible to share the bulk of the implementation with
>> classify_object_over_fdes?
>
> unfortunately classify_object_over_fdes modifies the struct object,
> and we must not do that. I thought about reusing the code, but it is
> not trivial to do. I can try harder if you want, but it will make the
> other code path uglier.

I see, please add a comment (on both copies of the code), as a hint to
keep it in sync.

>> I think we should keep any_objects_registered, to avoid the SEQ_CST
>> fences in btree_lookup.  There might be some interest from the
>> maintainers of weakly ordered architectures to move away from SEQ_CST to
>> something else, but I assume that could be a follow-up change.
>
> We can get the same effect by putting a
>
> if (__builtin_expect(!__atomic_load_n (&(t->root),__ATOMIC_RELAXED), 1))
>    return NULL;
>
> at the beginning of btree_lookup.

That's fine.  I want to make it obvious to architecture maintainers that
performance for the common case (no dynamic frames) does not change.

> Note, though, that this requires some kind of application-level
> synchronization between threads using unwinding and threads calling
> __register_frame. Just as in the existing code, the comment mentions
> that.

Isn't that comment misleading?  Some synchronization is needed even with
the full locking to make sure that unwind data has been registered
before unwinding.  The main locking ensures that the data read is
consistent, but not that the data is there.

Thanks,
Florian
Thomas Neumann March 3, 2022, 9:51 a.m. UTC | #4
> We may have to add a new interface.  In some other cases, I've seen
> errno being used for error reporting, but that requires changes in
> applications to recognize the error.  It's probably better to crash here
> than to fail mysteriously later.
> 
> Out of curiosity, how many times do you call the registration functions
> from your application?  Would a batch registration interface help?  It
> could address error reporting as well.

we are compiling SQL queries into machine code, which means we call 
__(de)register_frame once per query. There is usually no way to batch 
that, as the queries typically come in one after the other.
In practice the system caches generated code, which means that most of 
the time we are executing an already compiled query with different 
parameters. Thus __register_frame is not a big performance problem for 
us, but unwinding is, as queries are executed in parallel using hundreds 
of threads and some of them fail.
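
For context, the registration calls on the JIT side are just the plain
libgcc entry points; a minimal usage sketch (buffer and hook names are
made up):

/* Declared by libgcc; the argument is the start of the unwind data
   emitted alongside the generated code.  */
extern void __register_frame (void *);
extern void __deregister_frame (void *);

/* Hypothetical JIT hooks: one registration per compiled query, one
   deregistration when the cached code for that query is evicted.  */
static void
query_code_installed (void *query_eh_frame)
{
  __register_frame (query_eh_frame);
}

static void
query_code_evicted (void *query_eh_frame)
{
  __deregister_frame (query_eh_frame);
}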

Thank you for your detailed feedback, I will incorporate your 
suggestions and will send an updated patch in a few days.

Best

Thomas

Patch

diff --git a/libgcc/unwind-dw2-btree.h b/libgcc/unwind-dw2-btree.h
new file mode 100644
index 00000000000..b16bb9994f8
--- /dev/null
+++ b/libgcc/unwind-dw2-btree.h
@@ -0,0 +1,870 @@ 
+/* Lock-free btree for manually registered unwind frames  */
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   Contributed by Thomas Neumann
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_UNWIND_DW2_BTREE_H
+#define GCC_UNWIND_DW2_BTREE_H
+
+#include <stdbool.h>
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility push(default)
+#endif
+
+// Common logic for version locks
+struct version_lock
+{
+  // The lock itself
+  uintptr_t version_lock;
+};
+
+// Initialize in locked state
+static inline void
+version_lock_initialize_locked_exclusive (struct version_lock *vl)
+{
+  vl->version_lock = 1;
+}
+
+// Try to lock the node exclusive
+static inline bool
+version_lock_try_lock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  if (state & 1)
+    return false;
+  return __atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+				      false, __ATOMIC_SEQ_CST,
+				      __ATOMIC_SEQ_CST);
+}
+
+// Lock the node exclusive, blocking as needed
+static void
+version_lock_lock_exclusive (struct version_lock *vl)
+{
+  // We should virtually never get contention here, as frame
+  // changes are rare. Thus we use a simple spinlock
+  while (true)
+    {
+      uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+      if (state & 1)
+	continue;
+      if (__atomic_compare_exchange_n (&(vl->version_lock), &state, state | 1,
+				       false, __ATOMIC_SEQ_CST,
+				       __ATOMIC_SEQ_CST))
+	return;
+    }
+}
+
+// Release a locked node and increase the version lock
+static inline void
+version_lock_unlock_exclusive (struct version_lock *vl)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  __atomic_store_n (&(vl->version_lock), (state + 2) & (~((uintptr_t) 1)),
+		    __ATOMIC_SEQ_CST);
+}
+
+// Acquire an optimistic "lock". Note that this does not lock at all, it
+// only allows for validation later
+static inline bool
+version_lock_lock_optimistic (const struct version_lock *vl, uintptr_t *lock)
+{
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  *lock = state;
+
+  // Acquire the lock fails when there is currently an exclusive lock
+  return !(state & 1);
+}
+
+// Validate a previously acquire lock
+static inline bool
+version_lock_validate (const struct version_lock *vl, uintptr_t lock)
+{
+  // Check that the node is still in the same state
+  uintptr_t state = __atomic_load_n (&(vl->version_lock), __ATOMIC_SEQ_CST);
+  return (state == lock);
+}
+
+// The largest possible separator value
+static const uintptr_t max_separator = ~((uintptr_t) (0));
+
+struct btree_node;
+
+// Inner entry. The child tree contains all entries <= separator
+struct inner_entry
+{
+  uintptr_t separator;
+  struct btree_node *child;
+};
+
+// Leaf entry. Stores an object entry
+struct leaf_entry
+{
+  uintptr_t base, size;
+  struct object *ob;
+};
+
+// node types
+enum node_type
+{
+  btree_node_inner,
+  btree_node_leaf,
+  btree_node_free
+};
+
+// Node sizes. Choosen such that the result size if roughly 256 bytes
+#define max_fanout_inner 15
+#define max_fanout_leaf 10
+
+// A btree node
+struct btree_node
+{
+  // The version lock used for optimistic lock coupling
+  struct version_lock version_lock;
+  // The number of entries
+  unsigned entry_count;
+  // The type
+  enum node_type type;
+  // The payload
+  union
+  {
+    // The inner nodes have fence keys, i.e., the right-most entry includes a
+    // separator
+    struct inner_entry children[max_fanout_inner];
+    struct leaf_entry entries[max_fanout_leaf];
+  } content;
+};
+
+// Is an inner node?
+static inline bool
+btree_node_is_inner (const struct btree_node *n)
+{
+  return n->type == btree_node_inner;
+}
+
+// Is a leaf node?
+static inline bool
+btree_node_is_leaf (const struct btree_node *n)
+{
+  return n->type == btree_node_leaf;
+}
+
+// Should the node be merged?
+static inline bool
+btree_node_needs_merge (const struct btree_node *n)
+{
+  return n->entry_count < (btree_node_is_inner (n) ? (max_fanout_inner / 2)
+						   : (max_fanout_leaf / 2));
+}
+
+// Get the fence key for inner nodes
+static inline uintptr_t
+btree_node_get_fence_key (const struct btree_node *n)
+{
+  // For inner nodes we just return our right-most entry
+  return n->content.children[n->entry_count - 1].separator;
+}
+
+// Find the position for a slot in an inner node
+static unsigned
+btree_node_find_inner_slot (const struct btree_node *n, uintptr_t value)
+{
+  for (unsigned index = 0, ec = n->entry_count; index != ec; ++index)
+    if (n->content.children[index].separator >= value)
+      return index;
+  return n->entry_count;
+}
+
+// Find the position for a slot in a leaf node
+static unsigned
+btree_node_find_leaf_slot (const struct btree_node *n, uintptr_t value)
+{
+  for (unsigned index = 0, ec = n->entry_count; index != ec; ++index)
+    if (n->content.entries[index].base + n->content.entries[index].size > value)
+      return index;
+  return n->entry_count;
+}
+
+// Try to lock the node exclusive
+static inline bool
+btree_node_try_lock_exclusive (struct btree_node *n)
+{
+  return version_lock_try_lock_exclusive (&(n->version_lock));
+}
+
+// Lock the node exclusive, blocking as needed
+static inline void
+btree_node_lock_exclusive (struct btree_node *n)
+{
+  version_lock_lock_exclusive (&(n->version_lock));
+}
+
+// Release a locked node and increase the version lock
+static inline void
+btree_node_unlock_exclusive (struct btree_node *n)
+{
+  version_lock_unlock_exclusive (&(n->version_lock));
+}
+
+// Acquire an optimistic "lock". Note that this does not lock at all, it
+// only allows for validation later
+static inline bool
+btree_node_lock_optimistic (const struct btree_node *n, uintptr_t *lock)
+{
+  return version_lock_lock_optimistic (&(n->version_lock), lock);
+}
+
+// Validate a previously acquire lock
+static inline bool
+btree_node_validate (const struct btree_node *n, uintptr_t lock)
+{
+  return version_lock_validate (&(n->version_lock), lock);
+}
+
+// Insert a new separator after splitting
+static void
+btree_node_update_separator_after_split (struct btree_node *n,
+					 uintptr_t old_separator,
+					 uintptr_t new_separator,
+					 struct btree_node *new_right)
+{
+  unsigned slot = btree_node_find_inner_slot (n, old_separator);
+  for (unsigned index = n->entry_count; index > slot; --index)
+    n->content.children[index] = n->content.children[index - 1];
+  n->content.children[slot].separator = new_separator;
+  n->content.children[slot + 1].child = new_right;
+  n->entry_count++;
+}
+
+// A btree. Suitable for static initialization, all members are zero at the
+// beginning
+struct btree
+{
+  // The root of the btree
+  struct btree_node *root;
+  // The free list of released node
+  struct btree_node *free_list;
+  // The version lock used to protect the root
+  struct version_lock root_lock;
+};
+
+// Initialize a btree. Not actually used, just for exposition
+static inline void
+btree_init (struct btree *t)
+{
+  t->root = NULL;
+  t->free_list = NULL;
+  t->root_lock.version_lock = 0;
+};
+
+static void
+btree_release_tree_recursively (struct btree *t, struct btree_node *n);
+
+// Destroy a tree and release all nodes. Not used currently, but could be called
+// at shutdown to destroy the frame lookup
+static void
+btree_destroy (struct btree *t)
+{
+  // Disable the mechanism before cleaning up
+  struct btree_node *old_root
+    = __atomic_exchange_n (&(t->root), NULL, __ATOMIC_SEQ_CST);
+  if (old_root)
+    btree_release_tree_recursively (t, old_root);
+
+  // Release all free pages
+  while (t->free_list)
+    {
+      struct btree_node *next = t->free_list->content.children[0].child;
+      free (t->free_list);
+      t->free_list = next;
+    }
+}
+
+// Allocate a node. This node will be returned in locked exclusive state
+static struct btree_node *
+btree_allocate_node (struct btree *t, bool inner)
+{
+  while (true)
+    {
+      // Try the free list first
+      struct btree_node *next_free
+	= __atomic_load_n (&(t->free_list), __ATOMIC_SEQ_CST);
+      if (next_free)
+	{
+	  if (!btree_node_try_lock_exclusive (next_free))
+	    continue;
+	  // The node might no longer be free, check that again after acquiring
+	  // the exclusive lock
+	  if (next_free->type == btree_node_free)
+	    {
+	      struct btree_node *ex = next_free;
+	      if (__atomic_compare_exchange_n (
+		    &(t->free_list), &ex, next_free->content.children[0].child,
+		    false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+		{
+		  next_free->entry_count = 0;
+		  next_free->type = inner ? btree_node_inner : btree_node_leaf;
+		  return next_free;
+		}
+	    }
+	  btree_node_unlock_exclusive (next_free);
+	  continue;
+	}
+
+      // No free page available, allocate a new one
+      struct btree_node *new_node
+	= (struct btree_node *) (malloc (sizeof (struct btree_node)));
+      version_lock_initialize_locked_exclusive (
+	&(new_node->version_lock)); // initialize the node in locked state
+      new_node->entry_count = 0;
+      new_node->type = inner ? btree_node_inner : btree_node_leaf;
+      return new_node;
+    }
+}
+
+// Release a node. This node must be currently locked exclusively and will
+// be placed in the free list
+static void
+btree_release_node (struct btree *t, struct btree_node *node)
+{
+  // We cannot release the memory immediately because there might still be
+  // concurrent readers on that node. Put it in the free list instead
+  node->type = btree_node_free;
+  struct btree_node *next_free
+    = __atomic_load_n (&(t->free_list), __ATOMIC_SEQ_CST);
+  do
+    {
+      node->content.children[0].child = next_free;
+  } while (!__atomic_compare_exchange_n (&(t->free_list), &next_free, node,
+					 false, __ATOMIC_SEQ_CST,
+					 __ATOMIC_SEQ_CST));
+  btree_node_unlock_exclusive (node);
+}
+
+// Recursive release a tree. The btree is by design very shallow, thus
+// we can risk recursion here
+static void
+btree_release_tree_recursively (struct btree *t, struct btree_node *node)
+{
+  btree_node_lock_exclusive (node);
+  if (btree_node_is_inner (node))
+    {
+      for (unsigned index = 0; index < node->entry_count; ++index)
+	btree_release_tree_recursively (t, node->content.children[index].child);
+    }
+  btree_release_node (t, node);
+}
+
+// Check if we are splitting the root
+static void
+btree_handle_root_split (struct btree *t, struct btree_node **node,
+			 struct btree_node **parent)
+{
+  // We want to keep the root pointer stable to allow for contention
+  // free reads. Thus, we split the root by first moving the content
+  // of the root node to a new node, and then split that new node
+  if (!*parent)
+    {
+      // Allocate a new node, we guarantees us that we will have a parent
+      // afterwards
+      struct btree_node *new_node
+	= btree_allocate_node (t, btree_node_is_inner (*node));
+      struct btree_node *old_node = *node;
+      new_node->entry_count = old_node->entry_count;
+      new_node->content = old_node->content;
+      old_node->content.children[0].separator = max_separator;
+      old_node->content.children[0].child = new_node;
+      old_node->entry_count = 1;
+      old_node->type = btree_node_inner;
+
+      *parent = old_node;
+      *node = new_node;
+    }
+}
+
+// Split an inner node
+static void
+btree_split_inner (struct btree *t, struct btree_node **inner,
+		   struct btree_node **parent, uintptr_t target)
+{
+  // Check for the root
+  btree_handle_root_split (t, inner, parent);
+
+  // Create two inner node
+  uintptr_t right_fence = btree_node_get_fence_key (*inner);
+  struct btree_node *left_inner = *inner;
+  struct btree_node *right_inner = btree_allocate_node (t, true);
+  unsigned split = left_inner->entry_count / 2;
+  right_inner->entry_count = left_inner->entry_count - split;
+  for (unsigned index = 0; index < right_inner->entry_count; ++index)
+    right_inner->content.children[index]
+      = left_inner->content.children[split + index];
+  left_inner->entry_count = split;
+  uintptr_t left_fence = btree_node_get_fence_key (left_inner);
+  btree_node_update_separator_after_split (*parent, right_fence, left_fence,
+					   right_inner);
+  if (target <= left_fence)
+    {
+      *inner = left_inner;
+      btree_node_unlock_exclusive (right_inner);
+    }
+  else
+    {
+      *inner = right_inner;
+      btree_node_unlock_exclusive (left_inner);
+    }
+}
+
+// Split a leaf node
+static void
+btree_split_leaf (struct btree *t, struct btree_node **leaf,
+		  struct btree_node **parent, uintptr_t fence, uintptr_t target)
+{
+  // Check for the root
+  btree_handle_root_split (t, leaf, parent);
+
+  // Create two leaf node
+  uintptr_t right_fence = fence;
+  struct btree_node *left_leaf = *leaf;
+  struct btree_node *right_leaf = btree_allocate_node (t, false);
+  unsigned split = left_leaf->entry_count / 2;
+  right_leaf->entry_count = left_leaf->entry_count - split;
+  for (unsigned index = 0; index != right_leaf->entry_count; ++index)
+    right_leaf->content.entries[index]
+      = left_leaf->content.entries[split + index];
+  left_leaf->entry_count = split;
+  uintptr_t left_fence = right_leaf->content.entries[0].base - 1;
+  btree_node_update_separator_after_split (*parent, right_fence, left_fence,
+					   right_leaf);
+  if (target <= left_fence)
+    {
+      *leaf = left_leaf;
+      btree_node_unlock_exclusive (right_leaf);
+    }
+  else
+    {
+      *leaf = right_leaf;
+      btree_node_unlock_exclusive (left_leaf);
+    }
+}
+
+// Merge (or balance) child nodes
+static struct btree_node *
+btree_merge_node (struct btree *t, unsigned child_slot,
+		  struct btree_node *parent, uintptr_t target)
+{
+  // Choose the emptiest neighbor and lock both. The target child is already
+  // locked
+  unsigned left_slot;
+  struct btree_node *left_node, *right_node;
+  if ((child_slot == 0)
+      || (((child_slot + 1) < parent->entry_count)
+	  && (parent->content.children[child_slot + 1].child->entry_count
+	      < parent->content.children[child_slot - 1].child->entry_count)))
+    {
+      left_slot = child_slot;
+      left_node = parent->content.children[left_slot].child;
+      right_node = parent->content.children[left_slot + 1].child;
+      btree_node_lock_exclusive (right_node);
+    }
+  else
+    {
+      left_slot = child_slot - 1;
+      left_node = parent->content.children[left_slot].child;
+      right_node = parent->content.children[left_slot + 1].child;
+      btree_node_lock_exclusive (left_node);
+    }
+
+  // Can we merge both nodes into one node?
+  unsigned total_count = left_node->entry_count + right_node->entry_count;
+  unsigned max_count
+    = btree_node_is_inner (left_node) ? max_fanout_inner : max_fanout_leaf;
+  if (total_count <= max_count)
+    {
+      // Merge into the parent?
+      if (parent->entry_count == 2)
+	{
+	  // Merge children into parent. This can only happen at the root
+	  if (btree_node_is_inner (left_node))
+	    {
+	      for (unsigned index = 0; index != left_node->entry_count; ++index)
+		parent->content.children[index]
+		  = left_node->content.children[index];
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		parent->content.children[index + left_node->entry_count]
+		  = right_node->content.children[index];
+	    }
+	  else
+	    {
+	      parent->type = btree_node_leaf;
+	      for (unsigned index = 0; index != left_node->entry_count; ++index)
+		parent->content.entries[index]
+		  = left_node->content.entries[index];
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		parent->content.entries[index + left_node->entry_count]
+		  = right_node->content.entries[index];
+	    }
+	  parent->entry_count = total_count;
+	  btree_release_node (t, left_node);
+	  btree_release_node (t, right_node);
+	  return parent;
+	}
+      else
+	{
+	  // Regular merge
+	  if (btree_node_is_inner (left_node))
+	    {
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		left_node->content.children[left_node->entry_count++]
+		  = right_node->content.children[index];
+	    }
+	  else
+	    {
+	      for (unsigned index = 0; index != right_node->entry_count;
+		   ++index)
+		left_node->content.entries[left_node->entry_count++]
+		  = right_node->content.entries[index];
+	    }
+	  parent->content.children[left_slot].separator
+	    = parent->content.children[left_slot + 1].separator;
+	  for (unsigned index = left_slot + 1; index + 1 < parent->entry_count;
+	       ++index)
+	    parent->content.children[index]
+	      = parent->content.children[index + 1];
+	  parent->entry_count--;
+	  btree_release_node (t, right_node);
+	  btree_node_unlock_exclusive (parent);
+	  return left_node;
+	}
+    }
+
+  // No merge possible, rebalance instead
+  if (left_node->entry_count > right_node->entry_count)
+    {
+      // Shift from left to right
+      unsigned to_shift
+	= (left_node->entry_count - right_node->entry_count) / 2;
+      if (btree_node_is_inner (left_node))
+	{
+	  for (unsigned index = 0; index != right_node->entry_count; ++index)
+	    {
+	      unsigned pos = right_node->entry_count - 1 - index;
+	      right_node->content.children[pos + to_shift]
+		= right_node->content.children[pos];
+	    }
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    right_node->content.children[index]
+	      = left_node->content
+		  .children[left_node->entry_count - to_shift + index];
+	}
+      else
+	{
+	  for (unsigned index = 0; index != right_node->entry_count; ++index)
+	    {
+	      unsigned pos = right_node->entry_count - 1 - index;
+	      right_node->content.entries[pos + to_shift]
+		= right_node->content.entries[pos];
+	    }
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    right_node->content.entries[index]
+	      = left_node->content
+		  .entries[left_node->entry_count - to_shift + index];
+	}
+      left_node->entry_count -= to_shift;
+      right_node->entry_count += to_shift;
+    }
+  else
+    {
+      // Shift from right to left
+      unsigned to_shift
+	= (right_node->entry_count - left_node->entry_count) / 2;
+      if (btree_node_is_inner (left_node))
+	{
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    left_node->content.children[left_node->entry_count + index]
+	      = right_node->content.children[index];
+	  for (unsigned index = 0; index != right_node->entry_count - to_shift;
+	       ++index)
+	    right_node->content.children[index]
+	      = right_node->content.children[index + to_shift];
+	}
+      else
+	{
+	  for (unsigned index = 0; index != to_shift; ++index)
+	    left_node->content.entries[left_node->entry_count + index]
+	      = right_node->content.entries[index];
+	  for (unsigned index = 0; index != right_node->entry_count - to_shift;
+	       ++index)
+	    right_node->content.entries[index]
+	      = right_node->content.entries[index + to_shift];
+	}
+      left_node->entry_count += to_shift;
+      right_node->entry_count -= to_shift;
+    }
+  uintptr_t left_fence;
+  if (btree_node_is_leaf (left_node))
+    {
+      left_fence = right_node->content.entries[0].base - 1;
+    }
+  else
+    {
+      left_fence = btree_node_get_fence_key (left_node);
+    }
+  parent->content.children[left_slot].separator = left_fence;
+  btree_node_unlock_exclusive (parent);
+  if (target <= left_fence)
+    {
+      btree_node_unlock_exclusive (right_node);
+      return left_node;
+    }
+  else
+    {
+      btree_node_unlock_exclusive (left_node);
+      return right_node;
+    }
+}
+
+// Insert an entry
+static bool
+btree_insert (struct btree *t, uintptr_t base, uintptr_t size,
+	      struct object *ob)
+{
+  // Sanity check
+  if (!size)
+    return false;
+
+  // Access the root
+  struct btree_node *iter, *parent = NULL;
+  {
+    version_lock_lock_exclusive (&(t->root_lock));
+    iter = t->root;
+    if (iter)
+      {
+	btree_node_lock_exclusive (iter);
+      }
+    else
+      {
+	t->root = iter = btree_allocate_node (t, false);
+      }
+    version_lock_unlock_exclusive (&(t->root_lock));
+  }
+
+  // Walk down the btree with classic lock coupling and eager splits.
+  // Strictly speaking this is not performance optimal, we could use
+  // optimistic lock coupling until we hit a node that has to be modified.
+  // But that is more difficult to implement and frame registration is
+  // rare anyway, we use simple locking for now
+
+  uintptr_t fence = max_separator;
+  while (btree_node_is_inner (iter))
+    {
+      // Use eager splits to avoid lock coupling up
+      if (iter->entry_count == max_fanout_inner)
+	btree_split_inner (t, &iter, &parent, base);
+
+      unsigned slot = btree_node_find_inner_slot (iter, base);
+      if (parent)
+	btree_node_unlock_exclusive (parent);
+      parent = iter;
+      fence = iter->content.children[slot].separator;
+      iter = iter->content.children[slot].child;
+      btree_node_lock_exclusive (iter);
+    }
+
+  // Make sure we have space
+  if (iter->entry_count == max_fanout_leaf)
+    btree_split_leaf (t, &iter, &parent, fence, base);
+  if (parent)
+    btree_node_unlock_exclusive (parent);
+
+  // Insert in page
+  unsigned slot = btree_node_find_leaf_slot (iter, base);
+  if ((slot < iter->entry_count) && (iter->content.entries[slot].base == base))
+    {
+      // duplicate entry, this should never happen
+      btree_node_unlock_exclusive (iter);
+      return false;
+    }
+  for (unsigned index = iter->entry_count; index > slot; --index)
+    iter->content.entries[index] = iter->content.entries[index - 1];
+  struct leaf_entry *e = &(iter->content.entries[slot]);
+  e->base = base;
+  e->size = size;
+  e->ob = ob;
+  iter->entry_count++;
+  btree_node_unlock_exclusive (iter);
+  return true;
+}
+
+// Remove an entry
+static struct object *
+btree_remove (struct btree *t, uintptr_t base)
+{
+  // Access the root
+  version_lock_lock_exclusive (&(t->root_lock));
+  struct btree_node *iter = t->root;
+  if (iter)
+    btree_node_lock_exclusive (iter);
+  version_lock_unlock_exclusive (&(t->root_lock));
+  if (!iter)
+    return NULL;
+
+  // Same strategy as with insert, walk down with lock coupling and
+  // merge eagerly
+  while (btree_node_is_inner (iter))
+    {
+      unsigned slot = btree_node_find_inner_slot (iter, base);
+      struct btree_node *next = iter->content.children[slot].child;
+      btree_node_lock_exclusive (next);
+      if (btree_node_needs_merge (next))
+	{
+	  // Use eager merges to avoid lock coupling up
+	  iter = btree_merge_node (t, slot, iter, base);
+	}
+      else
+	{
+	  btree_node_unlock_exclusive (iter);
+	  iter = next;
+	}
+    }
+
+  // Remove existing entry
+  unsigned slot = btree_node_find_leaf_slot (iter, base);
+  if ((slot >= iter->entry_count) || (iter->content.entries[slot].base != base))
+    {
+      // not found, this should never happen
+      btree_node_unlock_exclusive (iter);
+      return NULL;
+    }
+  struct object *ob = iter->content.entries[slot].ob;
+  for (unsigned index = slot; index + 1 < iter->entry_count; ++index)
+    iter->content.entries[index] = iter->content.entries[index + 1];
+  iter->entry_count--;
+  btree_node_unlock_exclusive (iter);
+  return ob;
+}
+
+// Find the corresponding entry the given address
+static struct object *
+btree_lookup (const struct btree *t, uintptr_t target_addr)
+{
+  // The unwinding tables are mostly static, they only change when
+  // frames are added or removed. This makes it extremely unlikely that they
+  // change during a given unwinding sequence. Thus, we optimize for the
+  // contention free case and use optimistic lock coupling. This does not
+  // require any writes to shared state, instead we validate every read. It is
+  // important that we do not trust any value that we have read until we call
+  // validate again. Data can change at arbitrary points in time, thus we always
+  // copy something into a local variable and validate again before acting on
+  // the read. In the unlikely event that we encounter a concurrent change we
+  // simply restart and try again.
+
+restart:
+  struct btree_node *iter;
+  uintptr_t lock;
+  {
+    // Accessing the root node requires defending against concurrent pointer
+    // changes Thus we couple rootLock -> lock on root node -> validate rootLock
+    if (!version_lock_lock_optimistic (&(t->root_lock), &lock))
+      goto restart;
+    iter = t->root;
+    if (!version_lock_validate (&(t->root_lock), lock))
+      goto restart;
+    if (!iter)
+      return NULL;
+    uintptr_t child_lock;
+    if ((!btree_node_lock_optimistic (iter, &child_lock))
+	|| (!version_lock_validate (&(t->root_lock), lock)))
+      goto restart;
+    lock = child_lock;
+  }
+
+  // Now we can walk down towards the right leaf node
+  while (true)
+    {
+      enum node_type type = iter->type;
+      unsigned entry_count = iter->entry_count;
+      if (!btree_node_validate (iter, lock))
+	goto restart;
+      if (!entry_count)
+	return NULL;
+
+      if (type == btree_node_inner)
+	{
+	  // We cannot call find_inner_slot here because we can only trust our
+	  // validated entries
+	  unsigned slot = 0;
+	  while (((slot + 1) < entry_count)
+		 && (iter->content.children[slot].separator < target_addr))
+	    ++slot;
+	  struct btree_node *child = iter->content.children[slot].child;
+	  if (!btree_node_validate (iter, lock))
+	    goto restart;
+
+	  // The node content can change at any point in time, thus we must
+	  // interleave parent and child checks
+	  uintptr_t child_lock;
+	  if (!btree_node_lock_optimistic (child, &child_lock))
+	    goto restart;
+	  if (!btree_node_validate (iter, lock))
+	    goto restart; // make sure we still point to the correct node after
+			  // acquiring the optimistic lock
+
+	  // Go down
+	  iter = child;
+	  lock = child_lock;
+	}
+      else
+	{
+	  // We cannot call find_leaf_slot here because we can only trust our
+	  // validated entries
+	  unsigned slot = 0;
+	  while (((slot + 1) < entry_count)
+		 && (iter->content.entries[slot].base
+		       + iter->content.entries[slot].size
+		     <= target_addr))
+	    ++slot;
+	  struct leaf_entry entry = iter->content.entries[slot];
+	  if (!btree_node_validate (iter, lock))
+	    goto restart;
+
+	  // Check if we have a hit
+	  if ((entry.base <= target_addr)
+	      && (target_addr < entry.base + entry.size))
+	    {
+	      return entry.ob;
+	    }
+	  return NULL;
+	}
+    }
+}
+
+#ifndef HIDE_EXPORTS
+#pragma GCC visibility pop
+#endif
+
+#endif /* unwind-dw2-btree.h */
diff --git a/libgcc/unwind-dw2-fde.c b/libgcc/unwind-dw2-fde.c
index 8ee55be5675..56be596713b 100644
--- a/libgcc/unwind-dw2-fde.c
+++ b/libgcc/unwind-dw2-fde.c
@@ -42,15 +42,34 @@  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  #endif
  #endif
  
+#ifdef ATOMIC_FDE_FAST_PATH
+#include "unwind-dw2-btree.h"
+
+static struct btree registered_frames;
+
+static void
+release_registered_frames (void) __attribute__ ((destructor (110)));
+static void
+release_registered_frames (void)
+{
+  /* Release the b-tree and all frames. Frame releases that happen later are
+   * silently ignored */
+  btree_destroy (&registered_frames);
+}
+
+static void
+get_pc_range (const struct object *ob, uintptr_t *range);
+static void
+init_object (struct object *ob);
+
+#else
+
  /* The unseen_objects list contains objects that have been registered
     but not yet categorized in any way.  The seen_objects list has had
     its pc_begin and count fields initialized at minimum, and is sorted
     by decreasing value of pc_begin.  */
  static struct object *unseen_objects;
  static struct object *seen_objects;
-#ifdef ATOMIC_FDE_FAST_PATH
-static int any_objects_registered;
-#endif
  
  #ifdef __GTHREAD_MUTEX_INIT
  static __gthread_mutex_t object_mutex = __GTHREAD_MUTEX_INIT;
@@ -78,6 +97,7 @@  init_object_mutex_once (void)
  static __gthread_mutex_t object_mutex;
  #endif
  #endif
+#endif
  
  /* Called from crtbegin.o to register the unwind info for an object.  */
  
@@ -99,23 +119,23 @@  __register_frame_info_bases (const void *begin, struct object *ob,
    ob->fde_end = NULL;
  #endif
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // Initialize the object eagerly to avoid locking later in the lookup path
+  init_object (ob);
+
+  // And register the frame
+  uintptr_t range[2];
+  get_pc_range (ob, range);
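+  // range[0] is the lowest pc_begin and range[1] the highest pc_end over
+  // all FDEs of this object; the entry is keyed on range[0] and spans
+  // range[1] - range[0] bytes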
+  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+#else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
  
    ob->next = unseen_objects;
    unseen_objects = ob;
-#ifdef ATOMIC_FDE_FAST_PATH
-  /* Set flag that at least one library has registered FDEs.
-     Use relaxed MO here, it is up to the app to ensure that the library
-     loading/initialization happens-before using that library in other
-     threads (in particular unwinding with that library's functions
-     appearing in the backtraces).  Calling that library's functions
-     without waiting for the library to initialize would be racy.  */
-  if (!any_objects_registered)
-    __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
-#endif
  
    __gthread_mutex_unlock (&object_mutex);
+#endif
  }
  
  void
@@ -153,23 +173,23 @@  __register_frame_info_table_bases (void *begin, struct object *ob,
    ob->s.b.from_array = 1;
    ob->s.b.encoding = DW_EH_PE_omit;
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // Initialize the object eagerly to avoid locking later in the lookup path
+  init_object (ob);
+
+  // And register the frame
+  uintptr_t range[2];
+  get_pc_range (ob, range);
+  btree_insert (&registered_frames, range[0], range[1] - range[0], ob);
+#else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
  
    ob->next = unseen_objects;
    unseen_objects = ob;
-#ifdef ATOMIC_FDE_FAST_PATH
-  /* Set flag that at least one library has registered FDEs.
-     Use relaxed MO here, it is up to the app to ensure that the library
-     loading/initialization happens-before using that library in other
-     threads (in particular unwinding with that library's functions
-     appearing in the backtraces).  Calling that library's functions
-     without waiting for the library to initialize would be racy.  */
-  if (!any_objects_registered)
-    __atomic_store_n (&any_objects_registered, 1, __ATOMIC_RELAXED);
-#endif
  
    __gthread_mutex_unlock (&object_mutex);
+#endif
  }
  
  void
@@ -200,16 +220,33 @@  __register_frame_table (void *begin)
  void *
  __deregister_frame_info_bases (const void *begin)
  {
-  struct object **p;
    struct object *ob = 0;
  
    /* If .eh_frame is empty, we haven't registered.  */
    if ((const uword *) begin == 0 || *(const uword *) begin == 0)
      return ob;
  
+#ifdef ATOMIC_FDE_FAST_PATH
+  // Find the corresponding PC range
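+  // Only BEGIN is available here, so rebuild a minimal object around it and
+  // recompute the PC range that served as the btree key at registration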
+  struct object lookupob;
+  lookupob.tbase = 0;
+  lookupob.dbase = 0;
+  lookupob.u.single = begin;
+  lookupob.s.i = 0;
+  lookupob.s.b.encoding = DW_EH_PE_omit;
+#ifdef DWARF2_OBJECT_END_PTR_EXTENSION
+  lookupob.fde_end = NULL;
+#endif
+  uintptr_t range[2];
+  get_pc_range (&lookupob, range);
+
+  // And remove
+  ob = btree_remove (&registered_frames, range[0]);
+#else
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
  
+  struct object **p;
    for (p = &unseen_objects; *p ; p = &(*p)->next)
      if ((*p)->u.single == begin)
        {
@@ -241,6 +278,8 @@  __deregister_frame_info_bases (const void *begin)
  
   out:
    __gthread_mutex_unlock (&object_mutex);
+#endif
+
    gcc_assert (ob);
    return (void *) ob;
  }
@@ -264,7 +303,7 @@  __deregister_frame (void *begin)
     instead of an _Unwind_Context.  */
  
  static _Unwind_Ptr
-base_from_object (unsigned char encoding, struct object *ob)
+base_from_object (unsigned char encoding, const struct object *ob)
  {
    if (encoding == DW_EH_PE_omit)
      return 0;
@@ -821,6 +860,88 @@  init_object (struct object* ob)
    ob->s.b.sorted = 1;
  }
  
+#ifdef ATOMIC_FDE_FAST_PATH
+/* Get the PC range from FDEs.  */
+static void
+get_pc_range_from_fdes (const struct object *ob, const fde *this_fde,
+			uintptr_t *range)
+{
+  const struct dwarf_cie *last_cie = 0;
+  int encoding = DW_EH_PE_absptr;
+  _Unwind_Ptr base = 0;
+
+  for (; !last_fde (ob, this_fde); this_fde = next_fde (this_fde))
+    {
+      const struct dwarf_cie *this_cie;
+      _Unwind_Ptr mask, pc_begin, pc_range;
+
+      /* Skip CIEs.  */
+      if (this_fde->CIE_delta == 0)
+	continue;
+
+      this_cie = get_cie (this_fde);
+      if (this_cie != last_cie)
+	{
+	  last_cie = this_cie;
+	  encoding = get_cie_encoding (this_cie);
+	  base = base_from_object (encoding, ob);
+	}
+
+      const unsigned char *p;
+      p = read_encoded_value_with_base (encoding, base, this_fde->pc_begin,
+					&pc_begin);
+      read_encoded_value_with_base (encoding & 0x0F, 0, p, &pc_range);
+
+      /* Take care to ignore link-once functions that were removed.
+	 In these cases, the function address will be NULL, but if
+	 the encoding is smaller than a pointer a true NULL may not
+	 be representable.  Assume 0 in the representable bits is NULL.  */
+      mask = size_of_encoded_value (encoding);
+      if (mask < sizeof (void *))
+	mask = (((_Unwind_Ptr) 1) << (mask << 3)) - 1;
+      else
+	mask = -1;
+      if ((pc_begin & mask) == 0)
+	continue;
+
+      _Unwind_Ptr pc_end = pc_begin + pc_range;
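+      /* Accumulate the overall [min pc_begin, max pc_end) interval.  */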
+      if ((!range[0]) && (!range[1]))
+	{
+	  range[0] = pc_begin;
+	  range[1] = pc_end;
+	}
+      else
+	{
+	  if (pc_begin < range[0])
+	    range[0] = pc_begin;
+	  if (pc_end > range[1])
+	    range[1] = pc_end;
+	}
+    }
+}
+
+/* Get the PC range for lookup.  */
+static void
+get_pc_range (const struct object *ob, uintptr_t *range)
+{
+  range[0] = range[1] = 0;
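+  /* Walk whatever representation the object uses: the original data behind
+     an already-sorted copy, an array of FDE tables, or a single table.  */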
+  if (ob->s.b.sorted)
+    {
+      get_pc_range_from_fdes (ob, ob->u.sort->orig_data, range);
+    }
+  else if (ob->s.b.from_array)
+    {
+      fde **p = ob->u.array;
+      for (; *p; ++p)
+	get_pc_range_from_fdes (ob, *p, range);
+    }
+  else
+    {
+      get_pc_range_from_fdes (ob, ob->u.single, range);
+    }
+}
+#endif
+
  /* A linear search through a set of FDEs for the given PC.  This is
     used when there was insufficient memory to allocate and sort an
     array.  */
@@ -1033,17 +1154,12 @@  _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
    const fde *f = NULL;
  
  #ifdef ATOMIC_FDE_FAST_PATH
-  /* For targets where unwind info is usually not registered through these
-     APIs anymore, avoid taking a global lock.
-     Use relaxed MO here, it is up to the app to ensure that the library
-     loading/initialization happens-before using that library in other
-     threads (in particular unwinding with that library's functions
-     appearing in the backtraces).  Calling that library's functions
-     without waiting for the library to initialize would be racy.  */
-  if (__builtin_expect (!__atomic_load_n (&any_objects_registered,
-					  __ATOMIC_RELAXED), 1))
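+  // The btree lookup takes no global mutex; a concurrent registration at
+  // most forces the read-only traversal to restart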
+  ob = btree_lookup (&registered_frames, (uintptr_t) pc);
+  if (!ob)
      return NULL;
-#endif
+
+  f = search_object (ob, pc);
+#else
  
    init_object_mutex_once ();
    __gthread_mutex_lock (&object_mutex);
@@ -1081,6 +1197,7 @@  _Unwind_Find_FDE (void *pc, struct dwarf_eh_bases *bases)
  
   fini:
    __gthread_mutex_unlock (&object_mutex);
+#endif
  
    if (f)
      {
diff --git a/libgcc/unwind-dw2-fde.h b/libgcc/unwind-dw2-fde.h
index 8a011c358b4..77c2caa4f5a 100644
--- a/libgcc/unwind-dw2-fde.h
+++ b/libgcc/unwind-dw2-fde.h
@@ -166,7 +166,7 @@  next_fde (const fde *f)
  extern const fde * _Unwind_Find_FDE (void *, struct dwarf_eh_bases *);
  
  static inline int
-last_fde (struct object *obj __attribute__ ((__unused__)), const fde *f)
+last_fde (const struct object *obj __attribute__ ((__unused__)), const fde *f)
  {
  #ifdef DWARF2_OBJECT_END_PTR_EXTENSION
    return f == (const fde *) obj->fde_end || f->length == 0;