[3/3] Refactor atfork handlers

Message ID 1518008967-8310-3-git-send-email-adhemerval.zanella@linaro.org
State New
Headers show
Series
  • [1/3] Refactor Linux ARCH_FORK implementation
Related show

Commit Message

Adhemerval Zanella Feb. 7, 2018, 1:09 p.m.
Current implementation (sysdeps/nptl/fork.c) replicates the atfork
handlers list backward to invoke the child handlers after fork/clone
syscall.

The internal atfork handler list is implemented as a singly-linked list
so a lock-free algorithm can be used, trading fork multithread call
performance for some code complexity and dynamic stack allocation
(since the backwards list should not fail).

This patch refactors it to use a dynarray instead of a linked list.
It simplifies the external variables that need to be exported and also
the internal atfork handler member definition.

The downside is a serialization of fork call in multithread, since to
operate on the dynarray the internal lock should be used.  However
as noted by Florian, it already acquires external locks for malloc
and libio so it is already hitting some lock contention.  Besides,
posix_spawn should be faster and more scalable to run external programs
in multithread environments.

Checked on x86_64-linux-gnu.

	* nptl/Makefile (routines): Remove unregister-atfork.
	* nptl/register-atfork.c (fork_handler_pool): Remove variable.
	(fork_handler_alloc): Remove function.
	(fork_handlers, fork_handler_init): New variables.
	(__fork_lock): Rename to atfork_lock.
	(__register_atfork, __unregister_atfork, libc_freeres_fn): Rewrite
	to use a dynamic array to add/remove atfork handlers.
	* sysdeps/nptl/fork.c (__libc_fork): Likewise.
	* sysdeps/nptl/fork.h (__fork_lock, __fork_handlers, __linkin_atfork):
	Remove declaration.
	(fork_handler): Remove next, refcntr, and need_signal member.
	(__run_fork_handler_type): New enum.
	(__run_fork_handlers): New prototype.
	* sysdeps/nptl/libc-lockP.h (__libc_atfork): Remove declaration.
---
 ChangeLog                 |  15 +++++
 nptl/Makefile             |   2 +-
 nptl/register-atfork.c    | 146 +++++++++++++++++++---------------------------
 sysdeps/nptl/fork.c       |  96 +-----------------------------
 sysdeps/nptl/fork.h       |  31 +++++-----
 sysdeps/nptl/libc-lockP.h |   2 -
 6 files changed, 97 insertions(+), 195 deletions(-)

Comments

Florian Weimer Feb. 7, 2018, 3:07 p.m. | #1
On 02/07/2018 02:09 PM, Adhemerval Zanella wrote:
> +  for (size_t i = 0; i < fork_handler_list_size (&fork_handlers); i++)
> +    if (fork_handler_list_at (&fork_handlers, i)->dso_handle == dso_handle)
> +      {
> +        fork_handler_list_remove (&fork_handlers, i);
> +        break;
> +      }

I think there can be multiple fork handlers for one dso_handle, and this 
loop only removes one of them.

Thanks,
Florian
Adhemerval Zanella Feb. 7, 2018, 5:16 p.m. | #2
On 07/02/2018 13:07, Florian Weimer wrote:
> On 02/07/2018 02:09 PM, Adhemerval Zanella wrote:
>> +  for (size_t i = 0; i < fork_handler_list_size (&fork_handlers); i++)
>> +    if (fork_handler_list_at (&fork_handlers, i)->dso_handle == dso_handle)
>> +      {
>> +        fork_handler_list_remove (&fork_handlers, i);
>> +        break;
>> +      }
> 
> I think there can be multiple fork handlers for one dso_handle, and this loop only removes one of them.

Indeed, I overlooked it.  Below is an updated patch, which also includes a
file removal I forgot to add (unregister-atfork.c).

--

	* nptl/Makefile (routines): Remove unregister-atfork.
	* nptl/register-atfork.c (fork_handler_pool): Remove variable.
	(fork_handler_alloc): Remove function.
	(fork_handlers, fork_handler_init): New variables.
	(__fork_lock): Rename to atfork_lock.
	(__register_atfork, __unregister_atfork, libc_freeres_fn): Rewrite
	to use a dynamic array to add/remove atfork handlers.
	* sysdeps/nptl/fork.c (__libc_fork): Likewise.
	* sysdeps/nptl/fork.h (__fork_lock, __fork_handlers, __linkin_atfork):
	Remove declaration.
	(fork_handler): Remove next, refcntr, and need_signal member.
	(__run_fork_handler_type): New enum.
	(__run_fork_handlers): New prototype.
	* sysdeps/nptl/libc-lockP.h (__libc_atfork): Remove declaration.

---

diff --git a/nptl/Makefile b/nptl/Makefile
index 6fc2c8b..be7ee3e 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -30,7 +30,7 @@ install-lib-ldscripts := libpthread.so
 
 routines = alloca_cutoff forward libc-lowlevellock libc-cancellation \
 	   libc-cleanup libc_pthread_init libc_multiple_threads \
-	   register-atfork unregister-atfork pthread_self
+	   register-atfork pthread_self
 shared-only-routines = forward
 
 # We need to provide certain routines for compatibility with existing
diff --git a/nptl/register-atfork.c b/nptl/register-atfork.c
index f309cec..72169b8 100644
--- a/nptl/register-atfork.c
+++ b/nptl/register-atfork.c
@@ -22,123 +22,100 @@
 #include <fork.h>
 #include <atomic.h>
 
+#define DYNARRAY_ELEMENT           struct fork_handler
+#define DYNARRAY_STRUCT            fork_handler_list
+#define DYNARRAY_PREFIX            fork_handler_list_
+#define DYNARRAY_INITIAL_SIZE      48
+#include <malloc/dynarray-skeleton.c>
 
-struct fork_handler *__fork_handlers;
-
-/* Lock to protect allocation and deallocation of fork handlers.  */
-int __fork_lock = LLL_LOCK_INITIALIZER;
-
-
-/* Number of pre-allocated handler entries.  */
-#define NHANDLER 48
-
-/* Memory pool for fork handler structures.  */
-static struct fork_handler_pool
-{
-  struct fork_handler_pool *next;
-  struct fork_handler mem[NHANDLER];
-} fork_handler_pool;
-
-
-static struct fork_handler *
-fork_handler_alloc (void)
-{
-  struct fork_handler_pool *runp = &fork_handler_pool;
-  struct fork_handler *result = NULL;
-  unsigned int i;
-
-  do
-    {
-      /* Search for an empty entry.  */
-      for (i = 0; i < NHANDLER; ++i)
-	if (runp->mem[i].refcntr == 0)
-	  goto found;
-    }
-  while ((runp = runp->next) != NULL);
-
-  /* We have to allocate a new entry.  */
-  runp = (struct fork_handler_pool *) calloc (1, sizeof (*runp));
-  if (runp != NULL)
-    {
-      /* Enqueue the new memory pool into the list.  */
-      runp->next = fork_handler_pool.next;
-      fork_handler_pool.next = runp;
-
-      /* We use the last entry on the page.  This means when we start
-	 searching from the front the next time we will find the first
-	 entry unused.  */
-      i = NHANDLER - 1;
-
-    found:
-      result = &runp->mem[i];
-      result->refcntr = 1;
-      result->need_signal = 0;
-    }
-
-  return result;
-}
+static struct fork_handler_list fork_handlers;
+static bool fork_handler_init = false;
 
+static int atfork_lock = LLL_LOCK_INITIALIZER;
 
 int
 __register_atfork (void (*prepare) (void), void (*parent) (void),
 		   void (*child) (void), void *dso_handle)
 {
-  /* Get the lock to not conflict with other allocations.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
-  struct fork_handler *newp = fork_handler_alloc ();
+  if (!fork_handler_init)
+    {
+      fork_handler_list_init (&fork_handlers);
+      fork_handler_init = true;
+    }
 
+  struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers);
   if (newp != NULL)
     {
-      /* Initialize the new record.  */
       newp->prepare_handler = prepare;
       newp->parent_handler = parent;
       newp->child_handler = child;
       newp->dso_handle = dso_handle;
-
-      __linkin_atfork (newp);
     }
 
   /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
+  lll_unlock (atfork_lock, LLL_PRIVATE);
 
   return newp == NULL ? ENOMEM : 0;
 }
 libc_hidden_def (__register_atfork)
 
-
 void
-attribute_hidden
-__linkin_atfork (struct fork_handler *newp)
+__unregister_atfork (void *dso_handle)
 {
-  do
-    newp->next = __fork_handlers;
-  while (catomic_compare_and_exchange_bool_acq (&__fork_handlers,
-						newp, newp->next) != 0);
-}
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
+  for (size_t i = 0; i < fork_handler_list_size (&fork_handlers);)
+    {
+      /* dynarray remove maintains element order, so update index iff there is
+	 no removal.  */
+      if (fork_handler_list_at (&fork_handlers, i)->dso_handle == dso_handle)
+        fork_handler_list_remove (&fork_handlers, i);
+      else
+        i++;
+    }
 
-libc_freeres_fn (free_mem)
+  lll_unlock (atfork_lock, LLL_PRIVATE);
+}
+
+void
+__run_fork_handlers (enum __run_fork_handler_type who)
 {
-  /* Get the lock to not conflict with running forks.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
+  struct fork_handler *runp;
 
-  /* No more fork handlers.  */
-  __fork_handlers = NULL;
+  if (who == atfork_run_prepare)
+    {
+      lll_lock (atfork_lock, LLL_PRIVATE);
+      size_t sl = fork_handler_list_size (&fork_handlers);
+      for (size_t i = sl; i > 0; i--)
+	{
+	  runp = fork_handler_list_at (&fork_handlers, i - 1);
+	  if (runp->prepare_handler != NULL)
+	    runp->prepare_handler ();
+	}
+    }
+  else
+    {
+      size_t sl = fork_handler_list_size (&fork_handlers);
+      for (size_t i = 0; i < sl; i++)
+	{
+	  runp = fork_handler_list_at (&fork_handlers, i);
+	  if (who == atfork_run_child && runp->child_handler)
+	    runp->child_handler ();
+	  else if (who == atfork_run_parent && runp->parent_handler)
+	    runp->parent_handler ();
+	}
+      lll_unlock (atfork_lock, LLL_PRIVATE);
+    }
+}
 
-  /* Free eventually allocated memory blocks for the object pool.  */
-  struct fork_handler_pool *runp = fork_handler_pool.next;
 
-  memset (&fork_handler_pool, '\0', sizeof (fork_handler_pool));
+libc_freeres_fn (free_mem)
+{
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
-  /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
+  fork_handler_list_free (&fork_handlers);
 
-  /* We can free the memory after releasing the lock.  */
-  while (runp != NULL)
-    {
-      struct fork_handler_pool *oldp = runp;
-      runp = runp->next;
-      free (oldp);
-    }
+  lll_unlock (atfork_lock, LLL_PRIVATE);
 }
diff --git a/nptl/unregister-atfork.c b/nptl/unregister-atfork.c
deleted file mode 100644
index 20411ed..0000000
--- a/nptl/unregister-atfork.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (C) 2002-2018 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <errno.h>
-#include <stdlib.h>
-#include <fork.h>
-#include <atomic.h>
-#include <futex-internal.h>
-
-
-void
-__unregister_atfork (void *dso_handle)
-{
-  /* Check whether there is any entry in the list which we have to
-     remove.  It is likely that this is not the case so don't bother
-     getting the lock.
-
-     We do not worry about other threads adding entries for this DSO
-     right this moment.  If this happens this is a race and we can do
-     whatever we please.  The program will crash anyway seen.  */
-  struct fork_handler *runp = __fork_handlers;
-  struct fork_handler *lastp = NULL;
-
-  while (runp != NULL)
-    if (runp->dso_handle == dso_handle)
-      break;
-    else
-      {
-	lastp = runp;
-	runp = runp->next;
-      }
-
-  if (runp == NULL)
-    /* Nothing to do.  */
-    return;
-
-  /* Get the lock to not conflict with additions or deletions.  Note
-     that there couldn't have been another thread deleting something.
-     The __unregister_atfork function is only called from the
-     dlclose() code which itself serializes the operations.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
-
-  /* We have to create a new list with all the entries we don't remove.  */
-  struct deleted_handler
-  {
-    struct fork_handler *handler;
-    struct deleted_handler *next;
-  } *deleted = NULL;
-
-  /* Remove the entries for the DSO which is unloaded from the list.
-     It's a single linked list so readers are.  */
-  do
-    {
-    again:
-      if (runp->dso_handle == dso_handle)
-	{
-	  if (lastp == NULL)
-	    {
-	      /* We have to use an atomic operation here because
-		 __linkin_atfork also uses one.  */
-	      if (catomic_compare_and_exchange_bool_acq (&__fork_handlers,
-							 runp->next, runp)
-		  != 0)
-		{
-		  runp = __fork_handlers;
-		  goto again;
-		}
-	    }
-	  else
-	    lastp->next = runp->next;
-
-	  /* We cannot overwrite the ->next element now.  Put the deleted
-	     entries in a separate list.  */
-	  struct deleted_handler *newp = alloca (sizeof (*newp));
-	  newp->handler = runp;
-	  newp->next = deleted;
-	  deleted = newp;
-	}
-      else
-	lastp = runp;
-
-      runp = runp->next;
-    }
-  while (runp != NULL);
-
-  /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
-
-  /* Walk the list of all entries which have to be deleted.  */
-  while (deleted != NULL)
-    {
-      /* We need to be informed by possible current users.  */
-      deleted->handler->need_signal = 1;
-      /* Make sure this gets written out first.  */
-      atomic_write_barrier ();
-
-      /* Decrement the reference counter.  If it does not reach zero
-	 wait for the last user.  */
-      atomic_decrement (&deleted->handler->refcntr);
-      unsigned int val;
-      while ((val = deleted->handler->refcntr) != 0)
-	futex_wait_simple (&deleted->handler->refcntr, val, FUTEX_PRIVATE);
-
-      deleted = deleted->next;
-    }
-}
diff --git a/sysdeps/nptl/fork.c b/sysdeps/nptl/fork.c
index 0061ee0..ec56a82 100644
--- a/sysdeps/nptl/fork.c
+++ b/sysdeps/nptl/fork.c
@@ -48,11 +48,6 @@ pid_t
 __libc_fork (void)
 {
   pid_t pid;
-  struct used_handler
-  {
-    struct fork_handler *handler;
-    struct used_handler *next;
-  } *allp = NULL;
 
   /* Determine if we are running multiple threads.  We skip some fork
      handlers in the single-thread case, to make fork safer to use in
@@ -60,60 +55,7 @@ __libc_fork (void)
      but our current fork implementation is not.  */
   bool multiple_threads = THREAD_GETMEM (THREAD_SELF, header.multiple_threads);
 
-  /* Run all the registered preparation handlers.  In reverse order.
-     While doing this we build up a list of all the entries.  */
-  struct fork_handler *runp;
-  while ((runp = __fork_handlers) != NULL)
-    {
-      /* Make sure we read from the current RUNP pointer.  */
-      atomic_full_barrier ();
-
-      unsigned int oldval = runp->refcntr;
-
-      if (oldval == 0)
-	/* This means some other thread removed the list just after
-	   the pointer has been loaded.  Try again.  Either the list
-	   is empty or we can retry it.  */
-	continue;
-
-      /* Bump the reference counter.  */
-      if (atomic_compare_and_exchange_bool_acq (&__fork_handlers->refcntr,
-						oldval + 1, oldval))
-	/* The value changed, try again.  */
-	continue;
-
-      /* We bumped the reference counter for the first entry in the
-	 list.  That means that none of the following entries will
-	 just go away.  The unloading code works in the order of the
-	 list.
-
-	 While executing the registered handlers we are building a
-	 list of all the entries so that we can go backward later on.  */
-      while (1)
-	{
-	  /* Execute the handler if there is one.  */
-	  if (runp->prepare_handler != NULL)
-	    runp->prepare_handler ();
-
-	  /* Create a new element for the list.  */
-	  struct used_handler *newp
-	    = (struct used_handler *) alloca (sizeof (*newp));
-	  newp->handler = runp;
-	  newp->next = allp;
-	  allp = newp;
-
-	  /* Advance to the next handler.  */
-	  runp = runp->next;
-	  if (runp == NULL)
-	    break;
-
-	  /* Bump the reference counter for the next entry.  */
-	  atomic_increment (&runp->refcntr);
-	}
-
-      /* We are done.  */
-      break;
-    }
+  __run_fork_handlers (atfork_run_prepare);
 
   /* If we are not running multiple threads, we do not have to
      preserve lock state.  If fork runs from a signal handler, only
@@ -192,29 +134,7 @@ __libc_fork (void)
       __rtld_lock_initialize (GL(dl_load_lock));
 
       /* Run the handlers registered for the child.  */
-      while (allp != NULL)
-	{
-	  if (allp->handler->child_handler != NULL)
-	    allp->handler->child_handler ();
-
-	  /* Note that we do not have to wake any possible waiter.
-	     This is the only thread in the new process.  The count
-	     may have been bumped up by other threads doing a fork.
-	     We reset it to 1, to avoid waiting for non-existing
-	     thread(s) to release the count.  */
-	  allp->handler->refcntr = 1;
-
-	  /* XXX We could at this point look through the object pool
-	     and mark all objects not on the __fork_handlers list as
-	     unused.  This is necessary in case the fork() happened
-	     while another thread called dlclose() and that call had
-	     to create a new list.  */
-
-	  allp = allp->next;
-	}
-
-      /* Initialize the fork lock.  */
-      __fork_lock = LLL_LOCK_INITIALIZER;
+      __run_fork_handlers (atfork_run_child);
     }
   else
     {
@@ -229,17 +149,7 @@ __libc_fork (void)
 	}
 
       /* Run the handlers registered for the parent.  */
-      while (allp != NULL)
-	{
-	  if (allp->handler->parent_handler != NULL)
-	    allp->handler->parent_handler ();
-
-	  if (atomic_decrement_and_test (&allp->handler->refcntr)
-	      && allp->handler->need_signal)
-	    futex_wake (&allp->handler->refcntr, 1, FUTEX_PRIVATE);
-
-	  allp = allp->next;
-	}
+      __run_fork_handlers (atfork_run_parent);
     }
 
   return pid;
diff --git a/sysdeps/nptl/fork.h b/sysdeps/nptl/fork.h
index f0330cc..6eab61c 100644
--- a/sysdeps/nptl/fork.h
+++ b/sysdeps/nptl/fork.h
@@ -24,29 +24,37 @@ extern unsigned long int __fork_generation attribute_hidden;
 /* Pointer to the fork generation counter in the thread library.  */
 extern unsigned long int *__fork_generation_pointer attribute_hidden;
 
-/* Lock to protect allocation and deallocation of fork handlers.  */
-extern int __fork_lock attribute_hidden;
-
 /* Elements of the fork handler lists.  */
 struct fork_handler
 {
-  struct fork_handler *next;
   void (*prepare_handler) (void);
   void (*parent_handler) (void);
   void (*child_handler) (void);
   void *dso_handle;
-  unsigned int refcntr;
-  int need_signal;
 };
 
-/* The single linked list of all currently registered for handlers.  */
-extern struct fork_handler *__fork_handlers attribute_hidden;
-
-
 /* Function to call to unregister fork handlers.  */
 extern void __unregister_atfork (void *dso_handle) attribute_hidden;
 #define UNREGISTER_ATFORK(dso_handle) __unregister_atfork (dso_handle)
 
+enum __run_fork_handler_type
+{
+  atfork_run_prepare,
+  atfork_run_child,
+  atfork_run_parent
+};
+
+/* Run the atfork handlers and lock/unlock the internal lock depending
+   on the WHO argument:
+
+   - atfork_run_prepare: run all the PREPARE_HANDLER in reverse order of
+			 insertion and locks the internal lock.
+   - atfork_run_child: run all the CHILD_HANDLER and unlocks the internal
+		       lock.
+   - atfork_run_parent: run all the PARENT_HANDLER and unlocks the internal
+			lock.  */
+extern void __run_fork_handlers (enum __run_fork_handler_type who)
+  attribute_hidden;
 
 /* C library side function to register new fork handlers.  */
 extern int __register_atfork (void (*__prepare) (void),
@@ -54,6 +62,3 @@ extern int __register_atfork (void (*__prepare) (void),
 			      void (*__child) (void),
 			      void *dso_handle);
 libc_hidden_proto (__register_atfork)
-
-/* Add a new element to the fork list.  */
-extern void __linkin_atfork (struct fork_handler *newp) attribute_hidden;
diff --git a/sysdeps/nptl/libc-lockP.h b/sysdeps/nptl/libc-lockP.h
index 8539bbf..989fefa 100644
--- a/sysdeps/nptl/libc-lockP.h
+++ b/sysdeps/nptl/libc-lockP.h
@@ -319,8 +319,6 @@ __libc_cleanup_routine (struct __pthread_cleanup_frame *f)
 /* Register handlers to execute before and after `fork'.  Note that the
    last parameter is NULL.  The handlers registered by the libc are
    never removed so this is OK.  */
-#define __libc_atfork(PREPARE, PARENT, CHILD) \
-  __register_atfork (PREPARE, PARENT, CHILD, NULL)
 extern int __register_atfork (void (*__prepare) (void),
 			      void (*__parent) (void),
 			      void (*__child) (void),
Florian Weimer Feb. 8, 2018, 8:32 a.m. | #3
On 02/07/2018 06:16 PM, Adhemerval Zanella wrote:
> +  for (size_t i = 0; i < fork_handler_list_size (&fork_handlers);)
> +    {
> +      /* dynarray remove maintains element order, so update index iff there is
> +	 no removal.  */
> +      if (fork_handler_list_at (&fork_handlers, i)->dso_handle == dso_handle)
> +        fork_handler_list_remove (&fork_handlers, i);
> +      else
> +        i++;
> +    }

I thought a bit more about this.  Doesn't this lead to cubic run-time as 
DSOs are unloaded (quadratic run-time locally here, multiplied by the 
outer loop for unloading the DSOs)?

I think fork_handler_list_remove is the wrong abstraction here. 
Something like std::remove_if would be better, which moves each array 
element at most once even if multiple elements are removed during the 
scan.  Writing this generically in C is probably not worth the effort, 
so perhaps open-code that here?

Thanks,
Florian
Adhemerval Zanella Feb. 8, 2018, 12:50 p.m. | #4
On 08/02/2018 06:32, Florian Weimer wrote:
> On 02/07/2018 06:16 PM, Adhemerval Zanella wrote:
>> +  for (size_t i = 0; i < fork_handler_list_size (&fork_handlers);)
>> +    {
>> +      /* dynarray remove maintains element order, so update index iff there is
>> +     no removal.  */
>> +      if (fork_handler_list_at (&fork_handlers, i)->dso_handle == dso_handle)
>> +        fork_handler_list_remove (&fork_handlers, i);
>> +      else
>> +        i++;
>> +    }
> 
> I thought a bit more about this.  Doesn't this lead to cubic run-time as DSOs are unloaded (quadratic run-time locally here, multiplied by the outer loop for unloading the DSOs)?
> 
> I think fork_handler_list_remove is the wrong abstraction here. Something like std::remove_if would be better, which moves each array element at most once even if multiple elements are removed during the scan.  Writing this generically in C is probably not worth the effort, so perhaps open-code that here?
> 
> Thanks,
> Florian

I thought about it and decided to use the simplest approach, mainly because I assume
the number of atfork handlers should not be that high (the current static buffer assumes a
size of 48).  Using a simple benchmark to measure the difference (it measures the
time, using clock_gettime, to remove all elements in the list) I see:

size: 48
  remove    = 1236
  remove_if = 249
size: 1024
  remove    = 313755
  remove_if = 1017
bench: 16384
  remove    = 123934220
  remove_if = 40881

I find 1000 ns and even 30 us negligible; however, I do agree that if generic usage aims
for a high number of atfork handlers, remove_if is indeed a better strategy.  I adjusted my patch
to use it instead (and I think we can drop the dynarray remove for now).

--

diff --git a/nptl/Makefile b/nptl/Makefile
index 6fc2c8b..be7ee3e 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -30,7 +30,7 @@ install-lib-ldscripts := libpthread.so
 
 routines = alloca_cutoff forward libc-lowlevellock libc-cancellation \
 	   libc-cleanup libc_pthread_init libc_multiple_threads \
-	   register-atfork unregister-atfork pthread_self
+	   register-atfork pthread_self
 shared-only-routines = forward
 
 # We need to provide certain routines for compatibility with existing
diff --git a/nptl/register-atfork.c b/nptl/register-atfork.c
index f309cec..5826e4c 100644
--- a/nptl/register-atfork.c
+++ b/nptl/register-atfork.c
@@ -22,123 +22,127 @@
 #include <fork.h>
 #include <atomic.h>
 
+#define DYNARRAY_ELEMENT           struct fork_handler
+#define DYNARRAY_STRUCT            fork_handler_list
+#define DYNARRAY_PREFIX            fork_handler_list_
+#define DYNARRAY_INITIAL_SIZE      48
+#include <malloc/dynarray-skeleton.c>
 
-struct fork_handler *__fork_handlers;
-
-/* Lock to protect allocation and deallocation of fork handlers.  */
-int __fork_lock = LLL_LOCK_INITIALIZER;
-
-
-/* Number of pre-allocated handler entries.  */
-#define NHANDLER 48
-
-/* Memory pool for fork handler structures.  */
-static struct fork_handler_pool
-{
-  struct fork_handler_pool *next;
-  struct fork_handler mem[NHANDLER];
-} fork_handler_pool;
-
-
-static struct fork_handler *
-fork_handler_alloc (void)
-{
-  struct fork_handler_pool *runp = &fork_handler_pool;
-  struct fork_handler *result = NULL;
-  unsigned int i;
-
-  do
-    {
-      /* Search for an empty entry.  */
-      for (i = 0; i < NHANDLER; ++i)
-	if (runp->mem[i].refcntr == 0)
-	  goto found;
-    }
-  while ((runp = runp->next) != NULL);
-
-  /* We have to allocate a new entry.  */
-  runp = (struct fork_handler_pool *) calloc (1, sizeof (*runp));
-  if (runp != NULL)
-    {
-      /* Enqueue the new memory pool into the list.  */
-      runp->next = fork_handler_pool.next;
-      fork_handler_pool.next = runp;
-
-      /* We use the last entry on the page.  This means when we start
-	 searching from the front the next time we will find the first
-	 entry unused.  */
-      i = NHANDLER - 1;
-
-    found:
-      result = &runp->mem[i];
-      result->refcntr = 1;
-      result->need_signal = 0;
-    }
-
-  return result;
-}
+static struct fork_handler_list fork_handlers;
+static bool fork_handler_init = false;
 
+static int atfork_lock = LLL_LOCK_INITIALIZER;
 
 int
 __register_atfork (void (*prepare) (void), void (*parent) (void),
 		   void (*child) (void), void *dso_handle)
 {
-  /* Get the lock to not conflict with other allocations.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
-  struct fork_handler *newp = fork_handler_alloc ();
+  if (!fork_handler_init)
+    {
+      fork_handler_list_init (&fork_handlers);
+      fork_handler_init = true;
+    }
 
+  struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers);
   if (newp != NULL)
     {
-      /* Initialize the new record.  */
       newp->prepare_handler = prepare;
       newp->parent_handler = parent;
       newp->child_handler = child;
       newp->dso_handle = dso_handle;
-
-      __linkin_atfork (newp);
     }
 
   /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
+  lll_unlock (atfork_lock, LLL_PRIVATE);
 
   return newp == NULL ? ENOMEM : 0;
 }
 libc_hidden_def (__register_atfork)
 
+static struct fork_handler *
+fork_handler_list_find_if (struct fork_handler_list *fork_handlers,
+			   void *dso_handle)
+{
+  for (size_t i = 0; i < fork_handler_list_size (fork_handlers); i++)
+    {
+      struct fork_handler *elem = fork_handler_list_at (fork_handlers, i);
+      if (elem->dso_handle == dso_handle)
+	return elem;
+    }
+  return NULL;
+}
 
 void
-attribute_hidden
-__linkin_atfork (struct fork_handler *newp)
+__unregister_atfork (void *dso_handle)
 {
-  do
-    newp->next = __fork_handlers;
-  while (catomic_compare_and_exchange_bool_acq (&__fork_handlers,
-						newp, newp->next) != 0);
-}
+  lll_lock (atfork_lock, LLL_PRIVATE);
+
+  struct fork_handler *first = fork_handler_list_find_if (&fork_handlers,
+							  dso_handle);
+  /* Removal is done by shifting the elements so that the elements
+     that are not to be removed end up at the beginning of the dynarray.
+     This avoids the quadratic run-time of the naive strategy of removing
+     and shifting one element at a time.  */
+  if (first != NULL)
+    {
+      struct fork_handler *result = first;
+      first++;
+      for (; first != fork_handler_list_end (&fork_handlers); ++first)
+	{
+	  if (first->dso_handle != dso_handle)
+	    {
+	      memcpy (result, first, sizeof (struct fork_handler));
+	      ++result;
+	    }
+	}
+
+      ptrdiff_t removed = first - result;
+      for (size_t i = 0; i < removed; i++)
+	fork_handler_list_remove_last (&fork_handlers);
+    }
 
+  lll_unlock (atfork_lock, LLL_PRIVATE);
+}
 
-libc_freeres_fn (free_mem)
+void
+__run_fork_handlers (enum __run_fork_handler_type who)
 {
-  /* Get the lock to not conflict with running forks.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
+  struct fork_handler *runp;
 
-  /* No more fork handlers.  */
-  __fork_handlers = NULL;
+  if (who == atfork_run_prepare)
+    {
+      lll_lock (atfork_lock, LLL_PRIVATE);
+      size_t sl = fork_handler_list_size (&fork_handlers);
+      for (size_t i = sl; i > 0; i--)
+	{
+	  runp = fork_handler_list_at (&fork_handlers, i - 1);
+	  if (runp->prepare_handler != NULL)
+	    runp->prepare_handler ();
+	}
+    }
+  else
+    {
+      size_t sl = fork_handler_list_size (&fork_handlers);
+      for (size_t i = 0; i < sl; i++)
+	{
+	  runp = fork_handler_list_at (&fork_handlers, i);
+	  if (who == atfork_run_child && runp->child_handler)
+	    runp->child_handler ();
+	  else if (who == atfork_run_parent && runp->parent_handler)
+	    runp->parent_handler ();
+	}
+      lll_unlock (atfork_lock, LLL_PRIVATE);
+    }
+}
 
-  /* Free eventually allocated memory blocks for the object pool.  */
-  struct fork_handler_pool *runp = fork_handler_pool.next;
 
-  memset (&fork_handler_pool, '\0', sizeof (fork_handler_pool));
+libc_freeres_fn (free_mem)
+{
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
-  /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
+  fork_handler_list_free (&fork_handlers);
 
-  /* We can free the memory after releasing the lock.  */
-  while (runp != NULL)
-    {
-      struct fork_handler_pool *oldp = runp;
-      runp = runp->next;
-      free (oldp);
-    }
+  lll_unlock (atfork_lock, LLL_PRIVATE);
 }
diff --git a/nptl/unregister-atfork.c b/nptl/unregister-atfork.c
deleted file mode 100644
index 20411ed..0000000
--- a/nptl/unregister-atfork.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (C) 2002-2018 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <errno.h>
-#include <stdlib.h>
-#include <fork.h>
-#include <atomic.h>
-#include <futex-internal.h>
-
-
-void
-__unregister_atfork (void *dso_handle)
-{
-  /* Check whether there is any entry in the list which we have to
-     remove.  It is likely that this is not the case so don't bother
-     getting the lock.
-
-     We do not worry about other threads adding entries for this DSO
-     right this moment.  If this happens this is a race and we can do
-     whatever we please.  The program will crash anyway seen.  */
-  struct fork_handler *runp = __fork_handlers;
-  struct fork_handler *lastp = NULL;
-
-  while (runp != NULL)
-    if (runp->dso_handle == dso_handle)
-      break;
-    else
-      {
-	lastp = runp;
-	runp = runp->next;
-      }
-
-  if (runp == NULL)
-    /* Nothing to do.  */
-    return;
-
-  /* Get the lock to not conflict with additions or deletions.  Note
-     that there couldn't have been another thread deleting something.
-     The __unregister_atfork function is only called from the
-     dlclose() code which itself serializes the operations.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
-
-  /* We have to create a new list with all the entries we don't remove.  */
-  struct deleted_handler
-  {
-    struct fork_handler *handler;
-    struct deleted_handler *next;
-  } *deleted = NULL;
-
-  /* Remove the entries for the DSO which is unloaded from the list.
-     It's a single linked list so readers are.  */
-  do
-    {
-    again:
-      if (runp->dso_handle == dso_handle)
-	{
-	  if (lastp == NULL)
-	    {
-	      /* We have to use an atomic operation here because
-		 __linkin_atfork also uses one.  */
-	      if (catomic_compare_and_exchange_bool_acq (&__fork_handlers,
-							 runp->next, runp)
-		  != 0)
-		{
-		  runp = __fork_handlers;
-		  goto again;
-		}
-	    }
-	  else
-	    lastp->next = runp->next;
-
-	  /* We cannot overwrite the ->next element now.  Put the deleted
-	     entries in a separate list.  */
-	  struct deleted_handler *newp = alloca (sizeof (*newp));
-	  newp->handler = runp;
-	  newp->next = deleted;
-	  deleted = newp;
-	}
-      else
-	lastp = runp;
-
-      runp = runp->next;
-    }
-  while (runp != NULL);
-
-  /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
-
-  /* Walk the list of all entries which have to be deleted.  */
-  while (deleted != NULL)
-    {
-      /* We need to be informed by possible current users.  */
-      deleted->handler->need_signal = 1;
-      /* Make sure this gets written out first.  */
-      atomic_write_barrier ();
-
-      /* Decrement the reference counter.  If it does not reach zero
-	 wait for the last user.  */
-      atomic_decrement (&deleted->handler->refcntr);
-      unsigned int val;
-      while ((val = deleted->handler->refcntr) != 0)
-	futex_wait_simple (&deleted->handler->refcntr, val, FUTEX_PRIVATE);
-
-      deleted = deleted->next;
-    }
-}
diff --git a/sysdeps/nptl/fork.c b/sysdeps/nptl/fork.c
index 0061ee0..ec56a82 100644
--- a/sysdeps/nptl/fork.c
+++ b/sysdeps/nptl/fork.c
@@ -48,11 +48,6 @@ pid_t
 __libc_fork (void)
 {
   pid_t pid;
-  struct used_handler
-  {
-    struct fork_handler *handler;
-    struct used_handler *next;
-  } *allp = NULL;
 
   /* Determine if we are running multiple threads.  We skip some fork
      handlers in the single-thread case, to make fork safer to use in
@@ -60,60 +55,7 @@ __libc_fork (void)
      but our current fork implementation is not.  */
   bool multiple_threads = THREAD_GETMEM (THREAD_SELF, header.multiple_threads);
 
-  /* Run all the registered preparation handlers.  In reverse order.
-     While doing this we build up a list of all the entries.  */
-  struct fork_handler *runp;
-  while ((runp = __fork_handlers) != NULL)
-    {
-      /* Make sure we read from the current RUNP pointer.  */
-      atomic_full_barrier ();
-
-      unsigned int oldval = runp->refcntr;
-
-      if (oldval == 0)
-	/* This means some other thread removed the list just after
-	   the pointer has been loaded.  Try again.  Either the list
-	   is empty or we can retry it.  */
-	continue;
-
-      /* Bump the reference counter.  */
-      if (atomic_compare_and_exchange_bool_acq (&__fork_handlers->refcntr,
-						oldval + 1, oldval))
-	/* The value changed, try again.  */
-	continue;
-
-      /* We bumped the reference counter for the first entry in the
-	 list.  That means that none of the following entries will
-	 just go away.  The unloading code works in the order of the
-	 list.
-
-	 While executing the registered handlers we are building a
-	 list of all the entries so that we can go backward later on.  */
-      while (1)
-	{
-	  /* Execute the handler if there is one.  */
-	  if (runp->prepare_handler != NULL)
-	    runp->prepare_handler ();
-
-	  /* Create a new element for the list.  */
-	  struct used_handler *newp
-	    = (struct used_handler *) alloca (sizeof (*newp));
-	  newp->handler = runp;
-	  newp->next = allp;
-	  allp = newp;
-
-	  /* Advance to the next handler.  */
-	  runp = runp->next;
-	  if (runp == NULL)
-	    break;
-
-	  /* Bump the reference counter for the next entry.  */
-	  atomic_increment (&runp->refcntr);
-	}
-
-      /* We are done.  */
-      break;
-    }
+  __run_fork_handlers (atfork_run_prepare);
 
   /* If we are not running multiple threads, we do not have to
      preserve lock state.  If fork runs from a signal handler, only
@@ -192,29 +134,7 @@ __libc_fork (void)
       __rtld_lock_initialize (GL(dl_load_lock));
 
       /* Run the handlers registered for the child.  */
-      while (allp != NULL)
-	{
-	  if (allp->handler->child_handler != NULL)
-	    allp->handler->child_handler ();
-
-	  /* Note that we do not have to wake any possible waiter.
-	     This is the only thread in the new process.  The count
-	     may have been bumped up by other threads doing a fork.
-	     We reset it to 1, to avoid waiting for non-existing
-	     thread(s) to release the count.  */
-	  allp->handler->refcntr = 1;
-
-	  /* XXX We could at this point look through the object pool
-	     and mark all objects not on the __fork_handlers list as
-	     unused.  This is necessary in case the fork() happened
-	     while another thread called dlclose() and that call had
-	     to create a new list.  */
-
-	  allp = allp->next;
-	}
-
-      /* Initialize the fork lock.  */
-      __fork_lock = LLL_LOCK_INITIALIZER;
+      __run_fork_handlers (atfork_run_child);
     }
   else
     {
@@ -229,17 +149,7 @@ __libc_fork (void)
 	}
 
       /* Run the handlers registered for the parent.  */
-      while (allp != NULL)
-	{
-	  if (allp->handler->parent_handler != NULL)
-	    allp->handler->parent_handler ();
-
-	  if (atomic_decrement_and_test (&allp->handler->refcntr)
-	      && allp->handler->need_signal)
-	    futex_wake (&allp->handler->refcntr, 1, FUTEX_PRIVATE);
-
-	  allp = allp->next;
-	}
+      __run_fork_handlers (atfork_run_parent);
     }
 
   return pid;
diff --git a/sysdeps/nptl/fork.h b/sysdeps/nptl/fork.h
index f0330cc..6eab61c 100644
--- a/sysdeps/nptl/fork.h
+++ b/sysdeps/nptl/fork.h
@@ -24,29 +24,37 @@ extern unsigned long int __fork_generation attribute_hidden;
 /* Pointer to the fork generation counter in the thread library.  */
 extern unsigned long int *__fork_generation_pointer attribute_hidden;
 
-/* Lock to protect allocation and deallocation of fork handlers.  */
-extern int __fork_lock attribute_hidden;
-
 /* Elements of the fork handler lists.  */
 struct fork_handler
 {
-  struct fork_handler *next;
   void (*prepare_handler) (void);
   void (*parent_handler) (void);
   void (*child_handler) (void);
   void *dso_handle;
-  unsigned int refcntr;
-  int need_signal;
 };
 
-/* The single linked list of all currently registered for handlers.  */
-extern struct fork_handler *__fork_handlers attribute_hidden;
-
-
 /* Function to call to unregister fork handlers.  */
 extern void __unregister_atfork (void *dso_handle) attribute_hidden;
 #define UNREGISTER_ATFORK(dso_handle) __unregister_atfork (dso_handle)
 
+enum __run_fork_handler_type
+{
+  atfork_run_prepare,
+  atfork_run_child,
+  atfork_run_parent
+};
+
+/* Run the atfork handlers and lock/unlock the internal lock depending
+   on the WHO argument:
+
+   - atfork_run_prepare: run all the PREPARE_HANDLER in reverse order of
+			 insertion and locks the internal lock.
+   - atfork_run_child: run all the CHILD_HANDLER and unlocks the internal
+		       lock.
+   - atfork_run_parent: run all the PARENT_HANDLER and unlocks the internal
+			lock.  */
+extern void __run_fork_handlers (enum __run_fork_handler_type who)
+  attribute_hidden;
 
 /* C library side function to register new fork handlers.  */
 extern int __register_atfork (void (*__prepare) (void),
@@ -54,6 +62,3 @@ extern int __register_atfork (void (*__prepare) (void),
 			      void (*__child) (void),
 			      void *dso_handle);
 libc_hidden_proto (__register_atfork)
-
-/* Add a new element to the fork list.  */
-extern void __linkin_atfork (struct fork_handler *newp) attribute_hidden;
diff --git a/sysdeps/nptl/libc-lockP.h b/sysdeps/nptl/libc-lockP.h
index 8539bbf..989fefa 100644
--- a/sysdeps/nptl/libc-lockP.h
+++ b/sysdeps/nptl/libc-lockP.h
@@ -319,8 +319,6 @@ __libc_cleanup_routine (struct __pthread_cleanup_frame *f)
 /* Register handlers to execute before and after `fork'.  Note that the
    last parameter is NULL.  The handlers registered by the libc are
    never removed so this is OK.  */
-#define __libc_atfork(PREPARE, PARENT, CHILD) \
-  __register_atfork (PREPARE, PARENT, CHILD, NULL)
 extern int __register_atfork (void (*__prepare) (void),
 			      void (*__parent) (void),
 			      void (*__child) (void),
Florian Weimer Feb. 20, 2018, 11:29 a.m. | #5
On 02/08/2018 01:50 PM, Adhemerval Zanella wrote:
> +static struct fork_handler *
> +fork_handler_list_find_if (struct fork_handler_list *fork_handlers,
> +			   void *dso_handle)

Should be called _find, not find_if (no callback is involved).

> +  struct fork_handler *first = fork_handler_list_find_if (&fork_handlers,
> +							  dso_handle);
> +  /* Removing is done by shifting the elements in the way the elements
> +     that are not to be removed appear in the beginning in dynarray.
> +     This avoid the quadradic run-time if a naive strategy to remove and
> +     shift one element at time.  */
> +  if (first != NULL)
> +    {
> +      struct fork_handler *result = first;

result should probably be called new_end or something like that.

> +      first++;
> +      for (; first != fork_handler_list_end (&fork_handlers); ++first)
> +	{
> +	  if (first->dso_handle != dso_handle)
> +	    {
> +	      memcpy (result, first, sizeof (struct fork_handler));

Wouldn't a simple struct assignment work here?

I think this patch is a step in the right direction, so it should go in 
with these changes.

However, I think we should make a few improvements in follow-up fixes:

Reduce RSS usage for the common case that no atfork handlers are ever 
registered.  This will be the case once we remove the bogus 
__reclaim_stacks function.

Make a temporary copy of the handler array during fork.  This has two 
benefits: We can run the handlers without acquiring the handler lock (to 
avoid application deadlocks).  We also make sure that a handler does not 
run in a child process which did not run in the parent process.  I think 
the old implementation had both properties.

Thanks,
Florian
Adhemerval Zanella Feb. 20, 2018, 1 p.m. | #6
On 20/02/2018 08:29, Florian Weimer wrote:
> On 02/08/2018 01:50 PM, Adhemerval Zanella wrote:
>> +static struct fork_handler *
>> +fork_handler_list_find_if (struct fork_handler_list *fork_handlers,
>> +               void *dso_handle)
> 
> Should be called _find, not find_if (no callback is involved).

Fixed.

> 
>> +  struct fork_handler *first = fork_handler_list_find_if (&fork_handlers,
>> +                              dso_handle);
>> +  /* Removing is done by shifting the elements in the way the elements
>> +     that are not to be removed appear in the beginning in dynarray.
>> +     This avoid the quadradic run-time if a naive strategy to remove and
>> +     shift one element at time.  */
>> +  if (first != NULL)
>> +    {
>> +      struct fork_handler *result = first;
> 
> result should probably be called new_end or something like that.

I changed to new_end.

> 
>> +      first++;
>> +      for (; first != fork_handler_list_end (&fork_handlers); ++first)
>> +    {
>> +      if (first->dso_handle != dso_handle)
>> +        {
>> +          memcpy (result, first, sizeof (struct fork_handler));
> 
> Wouldn't a simple struct assignment work here?

I think so, I changed it to struct assignment.

> 
> I think this patch is a step in the right direction, so it should go in with these changes.

Thanks for the review.

> 
> However, I think we should make a few improvements in follow-up fixes:
> 
> Reduce RSS usage for the common case that no atfork handlers are ever registered.  This will be the case once we remove the bogus __reclaim_stacks function.
> 
> Make a temporary copy of the handler array during fork.  This has two benefits: We can run the handlers without acquiring the handler lock (to avoid application deadlocks).  We also make sure that a handler does not run in a child process which did not run in the parent process.  I think the old implementation had both properties.

The temporary copy is problematic because we either need to allocate on the stack using
vla/alloca (current practice and prone to stack overflow) or by malloc (which requires
locking anyway).  Also, to make the temporary copy we will need pretty much the same
lock-free algorithm, which adds code complexity.

My understanding is that the current algorithm tries hard to remove any locking on fork
generation mainly because back then posix_spawn was not specified and was suboptimal.  Now
that we have a faster way to spawn processes in multithreaded environments, I think there
is not much gain in trying to optimize locking in atfork handlers.

Regarding the handler running in the child process, the proposed implementation does
implement it.
Florian Weimer Feb. 20, 2018, 1:05 p.m. | #7
On 02/20/2018 02:00 PM, Adhemerval Zanella wrote:
> The temporary copy is problematic because we either need to allocate on the stack using
> vla/alloca (current practice and prone of stack overflow) or by malloc (which requires
> locking anyway).  Also, to temporary copy we will need pretty much the same lock-free
> algorithm which adds code complexity.

I think the lock in malloc is fine, at least for the time being, with 
our non-async-safe fork.

The point is not avoiding the lock, but callbacks when the lock is held. 
  This can easily result in deadlocks.

> My understanding is current algorithm tries hard to remove any locking on fork generation
> mainly because back then posix_spawn was no specified and suboptimal. Now that we have
> a faster way to spawn process in multithread environment I think there is no much gain
> in trying to optimizing locking in atfork handlers.

I think it's also needed to avoid deadlocks.

> Regarding the handler running in child process the proposed implementation does implement
 > it.

I don't see how?  I meant that only those handlers run that ran in the 
parent.  I think there's a window where more fork handlers can be added.

Thanks,
Florian
Adhemerval Zanella Feb. 20, 2018, 1:27 p.m. | #8
On 20/02/2018 10:05, Florian Weimer wrote:
> On 02/20/2018 02:00 PM, Adhemerval Zanella wrote:
>> The temporary copy is problematic because we either need to allocate on the stack using
>> vla/alloca (current practice and prone of stack overflow) or by malloc (which requires
>> locking anyway).  Also, to temporary copy we will need pretty much the same lock-free
>> algorithm which adds code complexity.
> 
> I think the lock in malloc is fine, at least for the time being, with our non-async-safe fork.
> 
> The point is not avoiding the lock, but callbacks when the lock is held.  This can easily result in deadlocks.

I think it might occur with the proposed implementation only if a callback tries to call
pthread_atfork or fork itself.  Is this the scenario you have in mind?  And should we
really support it if this is the case?


> 
>> My understanding is current algorithm tries hard to remove any locking on fork generation
>> mainly because back then posix_spawn was no specified and suboptimal. Now that we have
>> a faster way to spawn process in multithread environment I think there is no much gain
>> in trying to optimizing locking in atfork handlers.
> 
> I think it's also needed to avoid deadlocks .
> 
>> Regarding the handler running in child process the proposed implementation does implement
>> it.
> 
> I don't see how?  I meant that only those handlers run that ran in the parent.  I think there's a window where more fork handlers can be added.
>
Florian Weimer Feb. 20, 2018, 1:42 p.m. | #9
On 02/20/2018 02:27 PM, Adhemerval Zanella wrote:
> I think it might occur with proposed implementation only if a callback tries to call
> pthread_atfork or fork itself.  It these scenario you have in mind? And should we
> really support them if this is the case?

No.

__libc_fork starts like this:

   bool multiple_threads
     = THREAD_GETMEM (THREAD_SELF, header.multiple_threads);

   __run_fork_handlers (atfork_run_prepare);

And then acquires _IO_list_lock.

I don't see anything which prevents concurrent registration of 
additional fork handlers between the first and second call to 
__run_fork_handlers.

As I said, that shouldn't prevent inclusion of the current patch, but we 
need to fix this before 2.28, I think.

Thanks,
Florian
Adhemerval Zanella Feb. 20, 2018, 1:48 p.m. | #10
On 20/02/2018 10:42, Florian Weimer wrote:
> On 02/20/2018 02:27 PM, Adhemerval Zanella wrote:
>> I think it might occur with proposed implementation only if a callback tries to call
>> pthread_atfork or fork itself.  It these scenario you have in mind? And should we
>> really support them if this is the case?
> 
> No.
> 
> __libc_fork starts like this:
> 
>   bool multiple_threads
>     = THREAD_GETMEM (THREAD_SELF, header.multiple_threads);
> 
>   __run_fork_handlers (atfork_run_prepare);
> 
> And then acquires _IO_list_lock.
> 
> I don't see anything which prevents concurrent registration of additional fork handlers between the first and second call to __run_fork_handlers.

The atfork_run_prepare will instruct __run_fork_handlers to take the internal
atfork_lock handler:

  void
  __run_fork_handlers (enum __run_fork_handler_type who)
  {
    struct fork_handler *runp;

    if (who == atfork_run_prepare)
      {
        lll_lock (atfork_lock, LLL_PRIVATE);

And it will prevent new registrations from being added until either the parent or the
child calls __run_fork_handlers with either 'atfork_run_child' or 'atfork_run_parent'
to release the lock.

> 
> As I said, that shouldn't prevent inclusion of the current patch, but we need to fix this before 2.28, I think.
> 
> Thanks,
> Florian
Florian Weimer Feb. 20, 2018, 1:58 p.m. | #11
On 02/20/2018 02:48 PM, Adhemerval Zanella wrote:

> The atfork_run_prepare will instruct __run_fork_handlers to take the internal
> atfork_lock handler:
> 
>    void
>    __run_fork_handlers (enum __run_fork_handler_type who)
>    {
>      struct fork_handler *runp;
> 
>      if (who == atfork_run_prepare)
>        {
>          lll_lock (atfork_lock, LLL_PRIVATE);
> 
> And it will prevent to add new registration until either the parent or the child
> call __run_fork_handlers with either 'atfork_run_child' or 'atfork_run_parent'
> to release the lock.

Oh, sorry, I missed that.  So the patch does not have this problem. 
This does not settle the deadlock issue, though.

Thanks,
Florian
Adhemerval Zanella Feb. 20, 2018, 2:23 p.m. | #12
On 20/02/2018 10:58, Florian Weimer wrote:
> On 02/20/2018 02:48 PM, Adhemerval Zanella wrote:
> 
>> The atfork_run_prepare will instruct __run_fork_handlers to take the internal
>> atfork_lock handler:
>>
>>    void
>>    __run_fork_handlers (enum __run_fork_handler_type who)
>>    {
>>      struct fork_handler *runp;
>>
>>      if (who == atfork_run_prepare)
>>        {
>>          lll_lock (atfork_lock, LLL_PRIVATE);
>>
>> And it will prevent to add new registration until either the parent or the child
>> call __run_fork_handlers with either 'atfork_run_child' or 'atfork_run_parent'
>> to release the lock.
> 
> Oh, sorry, I missed that.  So the patch does not have this problem. This does not settle the deadlock issue, though.

Aside from the two scenarios (callbacks issuing fork/pthread_atfork), the only
other scenario I see which might trigger a deadlock in this case is a signal
handler issuing fork/pthread_atfork.

The former is BZ#4737, and my understanding is that this should be an EWONTFIX due
to the indication that a future POSIX specification will interpret fork as
async-signal-unsafe (comment #19, and I am not sure if fork could be made
async-signal-safe with ticket locks as Rich stated in comment #21).

Regarding the latter, I think pthread_atfork is inherently async-signal-unsafe since
it might return ENOMEM, indicating it might allocate memory, and our malloc
is also async-signal-unsafe.

Am I missing a scenario you might be considering?
Florian Weimer Feb. 23, 2018, 10:41 a.m. | #13
On 02/20/2018 03:23 PM, Adhemerval Zanella wrote:

> Aside of the two scenarios (callbacks issuing fork/pthread_atfork), the only
> other scenario I see which might trigger a deadlock in this case is a signal
> handler issuing fork/pthread_atfork.
> 
> Former is BZ#4737 and my understanding is this should be a EWONTFIX due
> indication future POSIX specification to interpret fork as async-signal-unsafe
> (comment #19 and I am not sure if fork could be made async-signal-safe with
> ticket locks as Rich stated in comment #21).
> 
> Regarding later I think pthread_atfork is inherent async-signal-unsafe due
> it might return ENOMEM indicating it might allocate memory and our malloc
> is also async-signal-unsafe.
> 
> Am I missing a scenario you might be considering?

I looked at the acquired locks during fork, and you are right, the 
corner cases where a deadlock can happen in the upstream sources are 
quite obscure.  However, we do not currently acquire any ld.so locks, 
and I think I've seen patches which change that (because upstream is 
buggy and crashes in the new child process).  If any ld.so locks are 
acquired around fork, then we have a lock ordering conflict in case an 
ELF constructor calls pthread_register_atfork (which is an extremely 
natural thing to do), like this:

Fork:

   pthread_register_atfork lock
     rtld load lock

dlopen:

   rtld load lock
     calling ELF constructors, and then:
       pthread_register_atfork lock

The older lock-free code avoids this.  You could do the same even with 
locks if you created a copy of the handler list on the heap.

Thanks,
Florian
Adhemerval Zanella Feb. 23, 2018, 12:10 p.m. | #14
On 23/02/2018 07:41, Florian Weimer wrote:
> On 02/20/2018 03:23 PM, Adhemerval Zanella wrote:
> 
>> Aside of the two scenarios (callbacks issuing fork/pthread_atfork), the only
>> other scenario I see which might trigger a deadlock in this case is a signal
>> handler issuing fork/pthread_atfork.
>>
>> Former is BZ#4737 and my understanding is this should be a EWONTFIX due
>> indication future POSIX specification to interpret fork as async-signal-unsafe
>> (comment #19 and I am not sure if fork could be made async-signal-safe with
>> ticket locks as Rich stated in comment #21).
>>
>> Regarding later I think pthread_atfork is inherent async-signal-unsafe due
>> it might return ENOMEM indicating it might allocate memory and our malloc
>> is also async-signal-unsafe.
>>
>> Am I missing a scenario you might be considering?
> 
> I looked at the acquired locks during fork, and you are right, the corner cases where a deadlock can happen in the upstream sources are quite obscure.  However, we do not currently acquire any ld.so locks, and I think I've seen patches which change that (because upstream is buggy and crash in the new child process).  If any ld.so locks are acquired around fork, then we have a lock ordering conflict in case an ELF constructor calls pthread_register_atfork (which is an extremely natural thing to do), like this:
> 
> Fork:
> 
>   pthread_register_atfork lock
>     rtld load lock
> 
> dlopen:
> 
>   rtld load lock
>     calling ELF constructors, and then:
>       pthread_register_atfork lock
> 
> The older lock-free code avoids this.  You could do the same even with locks if you created a copy of the handler list on the heap.

My understanding is that ld.so locks might be acquired in the callbacks called from
__run_fork_handlers:

  fork:
    __run_fork_handlers (atfork_run_prepare)
      lll_lock (atfork_lock)
      <callback>
         rtld load lock

However, I do not see how dlopen in a different thread would acquire the same
lock, since it has already been acquired by the callback.  The only way is if
dlopen is being called by a signal handler, which I think is another obscure
corner case.
Florian Weimer Feb. 27, 2018, 8:25 a.m. | #15
On 02/23/2018 01:10 PM, Adhemerval Zanella wrote:
> MY understanding is ld.so locks might be acquired in the callback calls from
> __run_fork_handlers:
> 
>    fork:
>      __run_fork_handlers (atfork_run_prepare)
>        lll_lock (atfork_lock)
>        <callback>
>           rtld load lock

Yes, that could happen even with the existing code.

My concern was with certain downstream patches in some distributions 
which acquire the rtld lock around fork, to avoid potentially corrupting 
the dynamic linker state in the child process (because the fork can no 
longer race with rtld data structure updates).

> However I do not see who in a different thread dlopen would acquire the same
> lock since it has been already acquired by the callback.  The only way is if
> dlopen is being called by a signal handler, which I think it another obscure
> corner case.

I meant that one thread would acquire the rtld lock first, and another 
thread would attempt to acquire the atfork lock, and then they proceed 
to acquire the opposite lock, which will deadlock.

Thanks,
Florian

Patch

diff --git a/nptl/Makefile b/nptl/Makefile
index 6fc2c8b..be7ee3e 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -30,7 +30,7 @@  install-lib-ldscripts := libpthread.so
 
 routines = alloca_cutoff forward libc-lowlevellock libc-cancellation \
 	   libc-cleanup libc_pthread_init libc_multiple_threads \
-	   register-atfork unregister-atfork pthread_self
+	   register-atfork pthread_self
 shared-only-routines = forward
 
 # We need to provide certain routines for compatibility with existing
diff --git a/nptl/register-atfork.c b/nptl/register-atfork.c
index f309cec..0bc2fe9 100644
--- a/nptl/register-atfork.c
+++ b/nptl/register-atfork.c
@@ -22,123 +22,97 @@ 
 #include <fork.h>
 #include <atomic.h>
 
+#define DYNARRAY_ELEMENT           struct fork_handler
+#define DYNARRAY_STRUCT            fork_handler_list
+#define DYNARRAY_PREFIX            fork_handler_list_
+#define DYNARRAY_INITIAL_SIZE      48
+#include <malloc/dynarray-skeleton.c>
 
-struct fork_handler *__fork_handlers;
-
-/* Lock to protect allocation and deallocation of fork handlers.  */
-int __fork_lock = LLL_LOCK_INITIALIZER;
-
-
-/* Number of pre-allocated handler entries.  */
-#define NHANDLER 48
-
-/* Memory pool for fork handler structures.  */
-static struct fork_handler_pool
-{
-  struct fork_handler_pool *next;
-  struct fork_handler mem[NHANDLER];
-} fork_handler_pool;
-
-
-static struct fork_handler *
-fork_handler_alloc (void)
-{
-  struct fork_handler_pool *runp = &fork_handler_pool;
-  struct fork_handler *result = NULL;
-  unsigned int i;
-
-  do
-    {
-      /* Search for an empty entry.  */
-      for (i = 0; i < NHANDLER; ++i)
-	if (runp->mem[i].refcntr == 0)
-	  goto found;
-    }
-  while ((runp = runp->next) != NULL);
-
-  /* We have to allocate a new entry.  */
-  runp = (struct fork_handler_pool *) calloc (1, sizeof (*runp));
-  if (runp != NULL)
-    {
-      /* Enqueue the new memory pool into the list.  */
-      runp->next = fork_handler_pool.next;
-      fork_handler_pool.next = runp;
-
-      /* We use the last entry on the page.  This means when we start
-	 searching from the front the next time we will find the first
-	 entry unused.  */
-      i = NHANDLER - 1;
-
-    found:
-      result = &runp->mem[i];
-      result->refcntr = 1;
-      result->need_signal = 0;
-    }
-
-  return result;
-}
+static struct fork_handler_list fork_handlers;
+static bool fork_handler_init = false;
 
+static int atfork_lock = LLL_LOCK_INITIALIZER;
 
 int
 __register_atfork (void (*prepare) (void), void (*parent) (void),
 		   void (*child) (void), void *dso_handle)
 {
-  /* Get the lock to not conflict with other allocations.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
-  struct fork_handler *newp = fork_handler_alloc ();
+  if (!fork_handler_init)
+    {
+      fork_handler_list_init (&fork_handlers);
+      fork_handler_init = true;
+    }
 
+  struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers);
   if (newp != NULL)
     {
-      /* Initialize the new record.  */
       newp->prepare_handler = prepare;
       newp->parent_handler = parent;
       newp->child_handler = child;
       newp->dso_handle = dso_handle;
-
-      __linkin_atfork (newp);
     }
 
   /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
+  lll_unlock (atfork_lock, LLL_PRIVATE);
 
   return newp == NULL ? ENOMEM : 0;
 }
 libc_hidden_def (__register_atfork)
 
-
 void
-attribute_hidden
-__linkin_atfork (struct fork_handler *newp)
+__unregister_atfork (void *dso_handle)
 {
-  do
-    newp->next = __fork_handlers;
-  while (catomic_compare_and_exchange_bool_acq (&__fork_handlers,
-						newp, newp->next) != 0);
-}
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
+  for (size_t i = 0; i < fork_handler_list_size (&fork_handlers); i++)
+    if (fork_handler_list_at (&fork_handlers, i)->dso_handle == dso_handle)
+      {
+        fork_handler_list_remove (&fork_handlers, i);
+        break;
+      }
 
-libc_freeres_fn (free_mem)
+  lll_unlock (atfork_lock, LLL_PRIVATE);
+}
+
+void
+__run_fork_handlers (enum __run_fork_handler_type who)
 {
-  /* Get the lock to not conflict with running forks.  */
-  lll_lock (__fork_lock, LLL_PRIVATE);
+  struct fork_handler *runp;
 
-  /* No more fork handlers.  */
-  __fork_handlers = NULL;
+  if (who == atfork_run_prepare)
+    {
+      lll_lock (atfork_lock, LLL_PRIVATE);
+      size_t sl = fork_handler_list_size (&fork_handlers);
+      for (size_t i = sl; i > 0; i--)
+	{
+	  runp = fork_handler_list_at (&fork_handlers, i - 1);
+	  if (runp->prepare_handler != NULL)
+	    runp->prepare_handler ();
+	}
+    }
+  else
+    {
+      size_t sl = fork_handler_list_size (&fork_handlers);
+      for (size_t i = 0; i < sl; i++)
+	{
+	  runp = fork_handler_list_at (&fork_handlers, i);
+	  if (who == atfork_run_child && runp->child_handler)
+	    runp->child_handler ();
+	  else if (who == atfork_run_parent && runp->parent_handler)
+	    runp->parent_handler ();
+	}
+      lll_unlock (atfork_lock, LLL_PRIVATE);
+    }
+}
 
-  /* Free eventually allocated memory blocks for the object pool.  */
-  struct fork_handler_pool *runp = fork_handler_pool.next;
 
-  memset (&fork_handler_pool, '\0', sizeof (fork_handler_pool));
+libc_freeres_fn (free_mem)
+{
+  lll_lock (atfork_lock, LLL_PRIVATE);
 
-  /* Release the lock.  */
-  lll_unlock (__fork_lock, LLL_PRIVATE);
+  fork_handler_list_free (&fork_handlers);
 
-  /* We can free the memory after releasing the lock.  */
-  while (runp != NULL)
-    {
-      struct fork_handler_pool *oldp = runp;
-      runp = runp->next;
-      free (oldp);
-    }
+  lll_unlock (atfork_lock, LLL_PRIVATE);
 }
diff --git a/sysdeps/nptl/fork.c b/sysdeps/nptl/fork.c
index 0061ee0..ec56a82 100644
--- a/sysdeps/nptl/fork.c
+++ b/sysdeps/nptl/fork.c
@@ -48,11 +48,6 @@  pid_t
 __libc_fork (void)
 {
   pid_t pid;
-  struct used_handler
-  {
-    struct fork_handler *handler;
-    struct used_handler *next;
-  } *allp = NULL;
 
   /* Determine if we are running multiple threads.  We skip some fork
      handlers in the single-thread case, to make fork safer to use in
@@ -60,60 +55,7 @@  __libc_fork (void)
      but our current fork implementation is not.  */
   bool multiple_threads = THREAD_GETMEM (THREAD_SELF, header.multiple_threads);
 
-  /* Run all the registered preparation handlers.  In reverse order.
-     While doing this we build up a list of all the entries.  */
-  struct fork_handler *runp;
-  while ((runp = __fork_handlers) != NULL)
-    {
-      /* Make sure we read from the current RUNP pointer.  */
-      atomic_full_barrier ();
-
-      unsigned int oldval = runp->refcntr;
-
-      if (oldval == 0)
-	/* This means some other thread removed the list just after
-	   the pointer has been loaded.  Try again.  Either the list
-	   is empty or we can retry it.  */
-	continue;
-
-      /* Bump the reference counter.  */
-      if (atomic_compare_and_exchange_bool_acq (&__fork_handlers->refcntr,
-						oldval + 1, oldval))
-	/* The value changed, try again.  */
-	continue;
-
-      /* We bumped the reference counter for the first entry in the
-	 list.  That means that none of the following entries will
-	 just go away.  The unloading code works in the order of the
-	 list.
-
-	 While executing the registered handlers we are building a
-	 list of all the entries so that we can go backward later on.  */
-      while (1)
-	{
-	  /* Execute the handler if there is one.  */
-	  if (runp->prepare_handler != NULL)
-	    runp->prepare_handler ();
-
-	  /* Create a new element for the list.  */
-	  struct used_handler *newp
-	    = (struct used_handler *) alloca (sizeof (*newp));
-	  newp->handler = runp;
-	  newp->next = allp;
-	  allp = newp;
-
-	  /* Advance to the next handler.  */
-	  runp = runp->next;
-	  if (runp == NULL)
-	    break;
-
-	  /* Bump the reference counter for the next entry.  */
-	  atomic_increment (&runp->refcntr);
-	}
-
-      /* We are done.  */
-      break;
-    }
+  __run_fork_handlers (atfork_run_prepare);
 
   /* If we are not running multiple threads, we do not have to
      preserve lock state.  If fork runs from a signal handler, only
@@ -192,29 +134,7 @@  __libc_fork (void)
       __rtld_lock_initialize (GL(dl_load_lock));
 
       /* Run the handlers registered for the child.  */
-      while (allp != NULL)
-	{
-	  if (allp->handler->child_handler != NULL)
-	    allp->handler->child_handler ();
-
-	  /* Note that we do not have to wake any possible waiter.
-	     This is the only thread in the new process.  The count
-	     may have been bumped up by other threads doing a fork.
-	     We reset it to 1, to avoid waiting for non-existing
-	     thread(s) to release the count.  */
-	  allp->handler->refcntr = 1;
-
-	  /* XXX We could at this point look through the object pool
-	     and mark all objects not on the __fork_handlers list as
-	     unused.  This is necessary in case the fork() happened
-	     while another thread called dlclose() and that call had
-	     to create a new list.  */
-
-	  allp = allp->next;
-	}
-
-      /* Initialize the fork lock.  */
-      __fork_lock = LLL_LOCK_INITIALIZER;
+      __run_fork_handlers (atfork_run_child);
     }
   else
     {
@@ -229,17 +149,7 @@  __libc_fork (void)
 	}
 
       /* Run the handlers registered for the parent.  */
-      while (allp != NULL)
-	{
-	  if (allp->handler->parent_handler != NULL)
-	    allp->handler->parent_handler ();
-
-	  if (atomic_decrement_and_test (&allp->handler->refcntr)
-	      && allp->handler->need_signal)
-	    futex_wake (&allp->handler->refcntr, 1, FUTEX_PRIVATE);
-
-	  allp = allp->next;
-	}
+      __run_fork_handlers (atfork_run_parent);
     }
 
   return pid;
diff --git a/sysdeps/nptl/fork.h b/sysdeps/nptl/fork.h
index f0330cc..6eab61c 100644
--- a/sysdeps/nptl/fork.h
+++ b/sysdeps/nptl/fork.h
@@ -24,29 +24,37 @@  extern unsigned long int __fork_generation attribute_hidden;
 /* Pointer to the fork generation counter in the thread library.  */
 extern unsigned long int *__fork_generation_pointer attribute_hidden;
 
-/* Lock to protect allocation and deallocation of fork handlers.  */
-extern int __fork_lock attribute_hidden;
-
 /* Elements of the fork handler lists.  */
 struct fork_handler
 {
-  struct fork_handler *next;
   void (*prepare_handler) (void);
   void (*parent_handler) (void);
   void (*child_handler) (void);
   void *dso_handle;
-  unsigned int refcntr;
-  int need_signal;
 };
 
-/* The single linked list of all currently registered for handlers.  */
-extern struct fork_handler *__fork_handlers attribute_hidden;
-
-
 /* Function to call to unregister fork handlers.  */
 extern void __unregister_atfork (void *dso_handle) attribute_hidden;
 #define UNREGISTER_ATFORK(dso_handle) __unregister_atfork (dso_handle)
 
+enum __run_fork_handler_type
+{
+  atfork_run_prepare,
+  atfork_run_child,
+  atfork_run_parent
+};
+
+/* Run the atfork handlers and lock/unlock the internal lock depending
+   on the WHO argument:
+
+   - atfork_run_prepare: lock the internal lock and run all the
+			 PREPARE_HANDLERs in reverse order of insertion.
+   - atfork_run_child: run all the CHILD_HANDLERs and unlock the internal
+		       lock.
+   - atfork_run_parent: run all the PARENT_HANDLERs and unlock the internal
+			lock.  */
+extern void __run_fork_handlers (enum __run_fork_handler_type who)
+  attribute_hidden;
 
 /* C library side function to register new fork handlers.  */
 extern int __register_atfork (void (*__prepare) (void),
@@ -54,6 +62,3 @@  extern int __register_atfork (void (*__prepare) (void),
 			      void (*__child) (void),
 			      void *dso_handle);
 libc_hidden_proto (__register_atfork)
-
-/* Add a new element to the fork list.  */
-extern void __linkin_atfork (struct fork_handler *newp) attribute_hidden;
diff --git a/sysdeps/nptl/libc-lockP.h b/sysdeps/nptl/libc-lockP.h
index 8539bbf..989fefa 100644
--- a/sysdeps/nptl/libc-lockP.h
+++ b/sysdeps/nptl/libc-lockP.h
@@ -319,8 +319,6 @@  __libc_cleanup_routine (struct __pthread_cleanup_frame *f)
 /* Register handlers to execute before and after `fork'.  Note that the
    last parameter is NULL.  The handlers registered by the libc are
    never removed so this is OK.  */
-#define __libc_atfork(PREPARE, PARENT, CHILD) \
-  __register_atfork (PREPARE, PARENT, CHILD, NULL)
 extern int __register_atfork (void (*__prepare) (void),
 			      void (*__parent) (void),
 			      void (*__child) (void),