diff mbox series

linux: Fallback to fork and exec for clone fail on posix_spawn (BZ#29115)

Message ID 20220502180605.1510951-1-adhemerval.zanella@linaro.org
State New
Headers show
Series linux: Fallback to fork and exec for clone fail on posix_spawn (BZ#29115) | expand

Commit Message

Adhemerval Zanella Netto May 2, 2022, 6:06 p.m. UTC
Linux clone with CLONE_VM | CLONE_VFORK may fail for some namespace
restriction (for instance kernel does not allow processes in different
time namespaces to share the sameaddress space).

In this case clone fails with EINVAL and thus posix_spawn can not spawn
a new process.  However the same process can be spawned with fork and exec.

The patch fixes by issues _Fork and exec iff clone fails with a non
transient failure (ENOMEM and EAGAIN still returns failure to caller).

Failure on prepare phase in helper process does not trigger the fork
and exec fallback.

Checked on x86_64-linux-gnu.
---
 sysdeps/unix/sysv/linux/spawni.c | 175 ++++++++++++++++++++-----------
 1 file changed, 116 insertions(+), 59 deletions(-)

Comments

Alexey Izbyshev May 2, 2022, 7:15 p.m. UTC | #1
On 2022-05-02 21:06, Adhemerval Zanella wrote:
> Linux clone with CLONE_VM | CLONE_VFORK may fail for some namespace
> restriction (for instance kernel does not allow processes in different
> time namespaces to share the sameaddress space).
> 
> In this case clone fails with EINVAL and thus posix_spawn can not spawn
> a new process.  However the same process can be spawned with fork and 
> exec.
> 
> The patch fixes by issues _Fork and exec iff clone fails with a non
> transient failure (ENOMEM and EAGAIN still returns failure to caller).
> 
> Failure on prepare phase in helper process does not trigger the fork
> and exec fallback.
> 
> Checked on x86_64-linux-gnu.
> ---
>  sysdeps/unix/sysv/linux/spawni.c | 175 ++++++++++++++++++++-----------
>  1 file changed, 116 insertions(+), 59 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/spawni.c 
> b/sysdeps/unix/sysv/linux/spawni.c
> index d6f5ca89cd..b73715da7e 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -67,6 +67,7 @@ struct posix_spawn_args
>    char *const *envp;
>    int xflags;
>    int err;
> +  int pipe[2];
>  };
> 
>  /* Older version requires that shell script without shebang definition
> @@ -103,6 +104,7 @@ __spawni_child (void *arguments)
>    struct posix_spawn_args *args = arguments;
>    const posix_spawnattr_t *restrict attr = args->attr;
>    const posix_spawn_file_actions_t *file_actions = args->fa;
> +  bool use_pipe = args->pipe[0] != -1;
> 
>    /* The child must ensure that no signal handler are enabled because 
> it shared
>       memory with parent, so the signal disposition must be either 
> SIG_DFL or
> @@ -113,6 +115,9 @@ __spawni_child (void *arguments)
>    struct sigaction sa;
>    memset (&sa, '\0', sizeof (sa));
> 
> +  if (use_pipe)
> +    __close (args->pipe[0]);
> +
>    sigset_t hset;
>    __sigprocmask (SIG_BLOCK, 0, &hset);
>    for (int sig = 1; sig < _NSIG; ++sig)
> @@ -300,45 +305,26 @@ fail:
>       (EINTERNALBUG) describing that, use ECHILD.  Another option would
>       be to set args->err to some negative sentinel and have the parent
>       abort(), but that seems needlessly harsh.  */
> -  args->err = errno ? : ECHILD;
> +  int ret = errno ? : ECHILD;
> +  if (use_pipe)
> +    {
> +      __close (args->pipe[0]);

The read end has already been closed above.

> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 
> 0);

If the parent is killed, write() will fail with EPIPE, so this loop will 
become infinite. Maybe check for EPIPE and break?

> +    }
> +  else
> +    args->err = ret;
> +
>    _exit (SPAWN_ERROR);
>  }
> 
> -/* Spawn a new process executing PATH with the attributes describes in 
> *ATTRP.
> -   Before running the process perform the actions described in 
> FILE-ACTIONS. */
> -static int
> -__spawnix (pid_t * pid, const char *file,
> -	   const posix_spawn_file_actions_t * file_actions,
> -	   const posix_spawnattr_t * attrp, char *const argv[],
> -	   char *const envp[], int xflags,
> -	   int (*exec) (const char *, char *const *, char *const *))
> +/* Create a new process using clone with CLONE_VM | CLONE_VFORK to 
> optimize
> +   memory usage.  Return TRUE is the helper process could be spawned, 
> or
> +   FALSE otherwise.  */
> +static bool
> +spawni_clone (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>  {
> -  pid_t new_pid;
> -  struct posix_spawn_args args;
> -  int ec;
> -
> -  /* To avoid imposing hard limits on posix_spawn{p} the total number 
> of
> -     arguments is first calculated to allocate a mmap to hold all 
> possible
> -     values.  */
> -  ptrdiff_t argc = 0;
> -  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
> -     to be used in a execve call.  We limit to INT_MAX minus one due 
> the
> -     compatiblity code that may execute a shell script 
> (maybe_script_execute)
> -     where it will construct another argument list with an additional
> -     argument.  */
> -  ptrdiff_t limit = INT_MAX - 1;
> -  while (argv[argc++] != NULL)
> -    if (argc == limit)
> -      {
> -	errno = E2BIG;
> -	return errno;
> -      }
> -
> -  int prot = (PROT_READ | PROT_WRITE
> -	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
> -
>    /* Add a slack area for child's stack.  */
> -  size_t argv_size = (argc * sizeof (void *)) + 512;
> +  size_t argv_size = (args->argc * sizeof (void *)) + 512;
>    /* We need at least a few pages in case the compiler's stack 
> checking is
>       enabled.  In some configs, it is known to use at least 24KiB.  We 
> use
>       32KiB to be "safe" from anything the compiler might do.  Besides, 
> the
> @@ -347,28 +333,12 @@ __spawnix (pid_t * pid, const char *file,
>       where it might use about 1k extra stack space).  */
>    argv_size += (32 * 1024);
>    size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
> +  int prot = (PROT_READ | PROT_WRITE
> +	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>    void *stack = __mmap (NULL, stack_size, prot,
>  			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>    if (__glibc_unlikely (stack == MAP_FAILED))
> -    return errno;
> -
> -  /* Disable asynchronous cancellation.  */
> -  int state;
> -  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
> -
> -  /* Child must set args.err to something non-negative - we rely on
> -     the parent and child sharing VM.  */
> -  args.err = 0;
> -  args.file = file;
> -  args.exec = exec;
> -  args.fa = file_actions;
> -  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
> -  args.argv = argv;
> -  args.argc = argc;
> -  args.envp = envp;
> -  args.xflags = xflags;
> -
> -  __libc_signal_block_all (&args.oldmask);

The caller expects the error to be in "*ec".

> +    return true;
> 
>    /* The clone flags used will create a new child that will run in the 
> same
>       memory space (CLONE_VM) and the execution of calling thread will 
> be
> @@ -385,12 +355,12 @@ __spawnix (pid_t * pid, const char *file,
>        .stack = (uintptr_t) stack,
>        .stack_size = stack_size,
>      };
> -  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
> +  *new_pid = __clone_internal (&clone_args, __spawni_child, args);
> 
>    /* It needs to collect the case where the auxiliary process was 
> created
>       but failed to execute the file (due either any preparation step 
> or
>       for execve itself).  */
> -  if (new_pid > 0)
> +  if (*new_pid > 0)
>      {
>        /* Also, it handles the unlikely case where the auxiliary 
> process was
>  	 terminated before calling execve as if it was successfully.  The
> @@ -398,21 +368,108 @@ __spawnix (pid_t * pid, const char *file,
>  	 only in case of failure, so in case of premature termination
>  	 due a signal args.err will remain zeroed and it will be up to
>  	 caller to actually collect it.  */
> -      ec = args.err;
> -      if (ec > 0)
> +      *ec = args->err;
> +      if (*ec > 0)
>  	/* There still an unlikely case where the child is cancelled after
>  	   setting args.err, due to a positive error value.  Also there is
>  	   possible pid reuse race (where the kernel allocated the same pid
>  	   to an unrelated process).  Unfortunately due synchronization
>  	   issues where the kernel might not have the process collected
>  	   the waitpid below can not use WNOHANG.  */
> -	__waitpid (new_pid, NULL, 0);
> +	__waitpid (*new_pid, NULL, 0);
>      }
>    else
> -    ec = errno;
> +    *ec = errno;
> 
>    __munmap (stack, stack_size);
> 
> +  /* There is no much point in retrying with fork and exec if kernel 
> returns a
> +     failure due resource exhaustion.  */
> +  return *new_pid > 0 || (errno != ENOMEM && errno != EAGAIN);
> +}
> +
> +/* Fallback spawn case which uses fork plus exec along with a pipe to
> +   communicate any helper process prepare pahse or execve failure.  */
> +static void
> +spawni_fork_fallback (struct posix_spawn_args *args, pid_t *new_pid, 
> int *ec)
> +{
> +  if (__pipe2 (args->pipe, O_CLOEXEC))

Same here: the caller expects the error to be in "*ec".

> +    return;
> +
> +  /* Do not trigger atfork handler nor any internal state reset since 
> the
> +     helper process will call execve.  */
> +  *new_pid = _Fork ();
> +  if (*new_pid == 0)
> +    __spawni_child (args);
> +  else if (*new_pid > 0)
> +    {
> +      __close (args->pipe[1]);

The write end should be closed on the error path too.

> +      if (__read (args->pipe[0], ec, sizeof *ec) != sizeof *ec)
> +	*ec = 0;
> +      else
> +	__waitpid (*new_pid, NULL, 0);
> +    }
> +  else
> +    *ec = errno;
> +
> +  __close (args->pipe[0]);
> +}
> +
> +/* Spawn a new process executing PATH with the attributes describes in 
> *ATTRP.
> +   Before running the process perform the actions described in 
> FILE-ACTIONS. */
> +static int
> +__spawnix (pid_t * pid, const char *file,
> +	   const posix_spawn_file_actions_t * file_actions,
> +	   const posix_spawnattr_t * attrp, char *const argv[],
> +	   char *const envp[], int xflags,
> +	   int (*exec) (const char *, char *const *, char *const *))
> +{
> +  /* To avoid imposing hard limits on posix_spawn{p} the total number 
> of
> +     arguments is first calculated to allocate a mmap to hold all 
> possible
> +     values.  */
> +  ptrdiff_t argc = 0;
> +  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
> +     to be used in a execve call.  We limit to INT_MAX minus one due 
> the
> +     compatiblity code that may execute a shell script 
> (maybe_script_execute)
> +     where it will construct another argument list with an additional
> +     argument.  */
> +  ptrdiff_t limit = INT_MAX - 1;
> +  while (argv[argc++] != NULL)
> +    if (argc == limit)
> +      {
> +	errno = E2BIG;
> +	return errno;
> +      }
> +
> +  /* Disable asynchronous cancellation.  */
> +  int state;
> +  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
> +
> +  /* Child must set args.err to something non-negative - we rely on
> +     the parent and child sharing VM.  */
> +  struct posix_spawn_args args;
> +  args.err = 0;
> +  args.file = file;
> +  args.exec = exec;
> +  args.fa = file_actions;
> +  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
> +  args.argv = argv;
> +  args.argc = argc;
> +  args.envp = envp;
> +  args.xflags = xflags;
> +  args.pipe[0] = args.pipe[1] = -1;
> +
> +  __libc_signal_block_all (&args.oldmask);
> +
> +  pid_t new_pid = 0;
> +  int ec = -1;
> +  /* clone with CLONE_VM | CLONE_VFORK may fail for some namespace 
> restriction
> +     (for instance Linux does not allow processes in different time 
> namespaces
> +     to share address space) and in this case clone fails with EINVAL. 
>  Retry
> +     with fork and exec.  */
> +  if (!spawni_clone (&args, &new_pid, &ec))
> +    spawni_fork_fallback (&args, &new_pid, &ec);
> +
>    if ((ec == 0) && (pid != NULL))
>      *pid = new_pid;

Shouldn't also __spawni_child() check whether posix_spawn actions are 
trying to do something with the error pipe write end (e.g. to close it, 
to dup() it or dup2() over it, etc.) and prevent that?

Alexey
Florian Weimer May 2, 2022, 7:30 p.m. UTC | #2
* Alexey Izbyshev:

>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) <
>> 0);
>
> If the parent is killed, write() will fail with EPIPE, so this loop
> will become infinite. Maybe check for EPIPE and break?

I think we also need to do something to avoid invoking a signal handler
for SIGPIPE here.

Thanks,
Florian
Alexey Izbyshev May 2, 2022, 7:37 p.m. UTC | #3
On 2022-05-02 22:30, Florian Weimer wrote:
> * Alexey Izbyshev:
> 
>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) <
>>> 0);
>> 
>> If the parent is killed, write() will fail with EPIPE, so this loop
>> will become infinite. Maybe check for EPIPE and break?
> 
> I think we also need to do something to avoid invoking a signal handler
> for SIGPIPE here.
> 
All signal handlers are reset in the beginning of __spawni_child().

Alexey
Adhemerval Zanella Netto May 2, 2022, 7:42 p.m. UTC | #4
On 02/05/2022 16:15, Alexey Izbyshev wrote:
> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>> Linux clone with CLONE_VM | CLONE_VFORK may fail for some namespace
>> restriction (for instance kernel does not allow processes in different
>> time namespaces to share the sameaddress space).
>>
>> In this case clone fails with EINVAL and thus posix_spawn can not spawn
>> a new process.  However the same process can be spawned with fork and exec.
>>
>> The patch fixes by issues _Fork and exec iff clone fails with a non
>> transient failure (ENOMEM and EAGAIN still returns failure to caller).
>>
>> Failure on prepare phase in helper process does not trigger the fork
>> and exec fallback.
>>
>> Checked on x86_64-linux-gnu.
>> ---
>>  sysdeps/unix/sysv/linux/spawni.c | 175 ++++++++++++++++++++-----------
>>  1 file changed, 116 insertions(+), 59 deletions(-)
>>
>> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
>> index d6f5ca89cd..b73715da7e 100644
>> --- a/sysdeps/unix/sysv/linux/spawni.c
>> +++ b/sysdeps/unix/sysv/linux/spawni.c
>> @@ -67,6 +67,7 @@ struct posix_spawn_args
>>    char *const *envp;
>>    int xflags;
>>    int err;
>> +  int pipe[2];
>>  };
>>
>>  /* Older version requires that shell script without shebang definition
>> @@ -103,6 +104,7 @@ __spawni_child (void *arguments)
>>    struct posix_spawn_args *args = arguments;
>>    const posix_spawnattr_t *restrict attr = args->attr;
>>    const posix_spawn_file_actions_t *file_actions = args->fa;
>> +  bool use_pipe = args->pipe[0] != -1;
>>
>>    /* The child must ensure that no signal handler are enabled because it shared
>>       memory with parent, so the signal disposition must be either SIG_DFL or
>> @@ -113,6 +115,9 @@ __spawni_child (void *arguments)
>>    struct sigaction sa;
>>    memset (&sa, '\0', sizeof (sa));
>>
>> +  if (use_pipe)
>> +    __close (args->pipe[0]);
>> +
>>    sigset_t hset;
>>    __sigprocmask (SIG_BLOCK, 0, &hset);
>>    for (int sig = 1; sig < _NSIG; ++sig)
>> @@ -300,45 +305,26 @@ fail:
>>       (EINTERNALBUG) describing that, use ECHILD.  Another option would
>>       be to set args->err to some negative sentinel and have the parent
>>       abort(), but that seems needlessly harsh.  */
>> -  args->err = errno ? : ECHILD;
>> +  int ret = errno ? : ECHILD;
>> +  if (use_pipe)
>> +    {
>> +      __close (args->pipe[0]);
> 
> The read end has already been closed above.

Ack.

> 
>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
> 
> If the parent is killed, write() will fail with EPIPE, so this loop will become infinite. Maybe check for EPIPE and break?

We block all signals (__libc_signal_block_all), so the parent will be only killed
by signals that can no be caught (SIGKILL).  Same for SIGPIPE.

> 
>> +    }
>> +  else
>> +    args->err = ret;
>> +
>>    _exit (SPAWN_ERROR);
>>  }
>>
>> -/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
>> -   Before running the process perform the actions described in FILE-ACTIONS. */
>> -static int
>> -__spawnix (pid_t * pid, const char *file,
>> -       const posix_spawn_file_actions_t * file_actions,
>> -       const posix_spawnattr_t * attrp, char *const argv[],
>> -       char *const envp[], int xflags,
>> -       int (*exec) (const char *, char *const *, char *const *))
>> +/* Create a new process using clone with CLONE_VM | CLONE_VFORK to optimize
>> +   memory usage.  Return TRUE is the helper process could be spawned, or
>> +   FALSE otherwise.  */
>> +static bool
>> +spawni_clone (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>>  {
>> -  pid_t new_pid;
>> -  struct posix_spawn_args args;
>> -  int ec;
>> -
>> -  /* To avoid imposing hard limits on posix_spawn{p} the total number of
>> -     arguments is first calculated to allocate a mmap to hold all possible
>> -     values.  */
>> -  ptrdiff_t argc = 0;
>> -  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
>> -     to be used in a execve call.  We limit to INT_MAX minus one due the
>> -     compatiblity code that may execute a shell script (maybe_script_execute)
>> -     where it will construct another argument list with an additional
>> -     argument.  */
>> -  ptrdiff_t limit = INT_MAX - 1;
>> -  while (argv[argc++] != NULL)
>> -    if (argc == limit)
>> -      {
>> -    errno = E2BIG;
>> -    return errno;
>> -      }
>> -
>> -  int prot = (PROT_READ | PROT_WRITE
>> -         | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>> -
>>    /* Add a slack area for child's stack.  */
>> -  size_t argv_size = (argc * sizeof (void *)) + 512;
>> +  size_t argv_size = (args->argc * sizeof (void *)) + 512;
>>    /* We need at least a few pages in case the compiler's stack checking is
>>       enabled.  In some configs, it is known to use at least 24KiB.  We use
>>       32KiB to be "safe" from anything the compiler might do.  Besides, the
>> @@ -347,28 +333,12 @@ __spawnix (pid_t * pid, const char *file,
>>       where it might use about 1k extra stack space).  */
>>    argv_size += (32 * 1024);
>>    size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
>> +  int prot = (PROT_READ | PROT_WRITE
>> +         | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>>    void *stack = __mmap (NULL, stack_size, prot,
>>              MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>>    if (__glibc_unlikely (stack == MAP_FAILED))
>> -    return errno;
>> -
>> -  /* Disable asynchronous cancellation.  */
>> -  int state;
>> -  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
>> -
>> -  /* Child must set args.err to something non-negative - we rely on
>> -     the parent and child sharing VM.  */
>> -  args.err = 0;
>> -  args.file = file;
>> -  args.exec = exec;
>> -  args.fa = file_actions;
>> -  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
>> -  args.argv = argv;
>> -  args.argc = argc;
>> -  args.envp = envp;
>> -  args.xflags = xflags;
>> -
>> -  __libc_signal_block_all (&args.oldmask);
> 
> The caller expects the error to be in "*ec".

Ack.

>> +  /* There is no much point in retrying with fork and exec if kernel returns a
>> +     failure due resource exhaustion.  */
>> +  return *new_pid > 0 || (errno != ENOMEM && errno != EAGAIN);
>> +}
>> +
>> +/* Fallback spawn case which uses fork plus exec along with a pipe to
>> +   communicate any helper process prepare pahse or execve failure.  */
>> +static void
>> +spawni_fork_fallback (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>> +{
>> +  if (__pipe2 (args->pipe, O_CLOEXEC))
> 
> Same here: the caller expects the error to be in "*ec".

Ack.

> 
>> +    return;
>> +
>> +  /* Do not trigger atfork handler nor any internal state reset since the
>> +     helper process will call execve.  */
>> +  *new_pid = _Fork ();
>> +  if (*new_pid == 0)
>> +    __spawni_child (args);
>> +  else if (*new_pid > 0)
>> +    {
>> +      __close (args->pipe[1]);
> 
> The write end should be closed on the error path too.

Ack, I moved the __close (args.pipe[1]) to be along __close (args.pipe[0]).


>> +  pid_t new_pid = 0;
>> +  int ec = -1;
>> +  /* clone with CLONE_VM | CLONE_VFORK may fail for some namespace restriction
>> +     (for instance Linux does not allow processes in different time namespaces
>> +     to share address space) and in this case clone fails with EINVAL.  Retry
>> +     with fork and exec.  */
>> +  if (!spawni_clone (&args, &new_pid, &ec))
>> +    spawni_fork_fallback (&args, &new_pid, &ec);
>> +
>>    if ((ec == 0) && (pid != NULL))
>>      *pid = new_pid;
> 
> Shouldn't also __spawni_child() check whether posix_spawn actions are trying to do something with the error pipe write end (e.g. to close it, to dup() it or dup2() over it, etc.) and prevent that?

Yeah we should, we need to either duplicate the file descriptor if a file operation
might clobber it and return EBADF for dup2 file action.
Alexey Izbyshev May 2, 2022, 7:54 p.m. UTC | #5
On 2022-05-02 22:42, Adhemerval Zanella wrote:
> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 
>>> 0);
>> 
>> If the parent is killed, write() will fail with EPIPE, so this loop 
>> will become infinite. Maybe check for EPIPE and break?
> 
> We block all signals (__libc_signal_block_all), so the parent will be
> only killed
> by signals that can no be caught (SIGKILL).  Same for SIGPIPE.

The parent can indeed only be killed by SIGKILL, or simply exit from 
another thread, but how exactly it dies doesn't matter. But signals are 
already unblocked at this point in the child, so if SIGPIPE was ignored 
in the parent, it's ignored now in the child and is unblocked, hence 
write() can fail with EPIPE.

Alexey
Adhemerval Zanella Netto May 2, 2022, 7:57 p.m. UTC | #6
On 02/05/2022 16:42, Adhemerval Zanella wrote:
> 
> 
> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>>> Linux clone with CLONE_VM | CLONE_VFORK may fail for some namespace
>>> restriction (for instance kernel does not allow processes in different
>>> time namespaces to share the sameaddress space).
>>>
>>> In this case clone fails with EINVAL and thus posix_spawn can not spawn
>>> a new process.  However the same process can be spawned with fork and exec.
>>>
>>> The patch fixes by issues _Fork and exec iff clone fails with a non
>>> transient failure (ENOMEM and EAGAIN still returns failure to caller).
>>>
>>> Failure on prepare phase in helper process does not trigger the fork
>>> and exec fallback.
>>>
>>> Checked on x86_64-linux-gnu.
>>> ---
>>>  sysdeps/unix/sysv/linux/spawni.c | 175 ++++++++++++++++++++-----------
>>>  1 file changed, 116 insertions(+), 59 deletions(-)
>>>
>>> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
>>> index d6f5ca89cd..b73715da7e 100644
>>> --- a/sysdeps/unix/sysv/linux/spawni.c
>>> +++ b/sysdeps/unix/sysv/linux/spawni.c
>>> @@ -67,6 +67,7 @@ struct posix_spawn_args
>>>    char *const *envp;
>>>    int xflags;
>>>    int err;
>>> +  int pipe[2];
>>>  };
>>>
>>>  /* Older version requires that shell script without shebang definition
>>> @@ -103,6 +104,7 @@ __spawni_child (void *arguments)
>>>    struct posix_spawn_args *args = arguments;
>>>    const posix_spawnattr_t *restrict attr = args->attr;
>>>    const posix_spawn_file_actions_t *file_actions = args->fa;
>>> +  bool use_pipe = args->pipe[0] != -1;
>>>
>>>    /* The child must ensure that no signal handler are enabled because it shared
>>>       memory with parent, so the signal disposition must be either SIG_DFL or
>>> @@ -113,6 +115,9 @@ __spawni_child (void *arguments)
>>>    struct sigaction sa;
>>>    memset (&sa, '\0', sizeof (sa));
>>>
>>> +  if (use_pipe)
>>> +    __close (args->pipe[0]);
>>> +
>>>    sigset_t hset;
>>>    __sigprocmask (SIG_BLOCK, 0, &hset);
>>>    for (int sig = 1; sig < _NSIG; ++sig)
>>> @@ -300,45 +305,26 @@ fail:
>>>       (EINTERNALBUG) describing that, use ECHILD.  Another option would
>>>       be to set args->err to some negative sentinel and have the parent
>>>       abort(), but that seems needlessly harsh.  */
>>> -  args->err = errno ? : ECHILD;
>>> +  int ret = errno ? : ECHILD;
>>> +  if (use_pipe)
>>> +    {
>>> +      __close (args->pipe[0]);
>>
>> The read end has already been closed above.
> 
> Ack.
> 
>>
>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
>>
>> If the parent is killed, write() will fail with EPIPE, so this loop will become infinite. Maybe check for EPIPE and break?
> 
> We block all signals (__libc_signal_block_all), so the parent will be only killed
> by signals that can no be caught (SIGKILL).  Same for SIGPIPE.
> 
>>
>>> +    }
>>> +  else
>>> +    args->err = ret;
>>> +
>>>    _exit (SPAWN_ERROR);
>>>  }
>>>
>>> -/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
>>> -   Before running the process perform the actions described in FILE-ACTIONS. */
>>> -static int
>>> -__spawnix (pid_t * pid, const char *file,
>>> -       const posix_spawn_file_actions_t * file_actions,
>>> -       const posix_spawnattr_t * attrp, char *const argv[],
>>> -       char *const envp[], int xflags,
>>> -       int (*exec) (const char *, char *const *, char *const *))
>>> +/* Create a new process using clone with CLONE_VM | CLONE_VFORK to optimize
>>> +   memory usage.  Return TRUE is the helper process could be spawned, or
>>> +   FALSE otherwise.  */
>>> +static bool
>>> +spawni_clone (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>>>  {
>>> -  pid_t new_pid;
>>> -  struct posix_spawn_args args;
>>> -  int ec;
>>> -
>>> -  /* To avoid imposing hard limits on posix_spawn{p} the total number of
>>> -     arguments is first calculated to allocate a mmap to hold all possible
>>> -     values.  */
>>> -  ptrdiff_t argc = 0;
>>> -  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
>>> -     to be used in a execve call.  We limit to INT_MAX minus one due the
>>> -     compatiblity code that may execute a shell script (maybe_script_execute)
>>> -     where it will construct another argument list with an additional
>>> -     argument.  */
>>> -  ptrdiff_t limit = INT_MAX - 1;
>>> -  while (argv[argc++] != NULL)
>>> -    if (argc == limit)
>>> -      {
>>> -    errno = E2BIG;
>>> -    return errno;
>>> -      }
>>> -
>>> -  int prot = (PROT_READ | PROT_WRITE
>>> -         | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>>> -
>>>    /* Add a slack area for child's stack.  */
>>> -  size_t argv_size = (argc * sizeof (void *)) + 512;
>>> +  size_t argv_size = (args->argc * sizeof (void *)) + 512;
>>>    /* We need at least a few pages in case the compiler's stack checking is
>>>       enabled.  In some configs, it is known to use at least 24KiB.  We use
>>>       32KiB to be "safe" from anything the compiler might do.  Besides, the
>>> @@ -347,28 +333,12 @@ __spawnix (pid_t * pid, const char *file,
>>>       where it might use about 1k extra stack space).  */
>>>    argv_size += (32 * 1024);
>>>    size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
>>> +  int prot = (PROT_READ | PROT_WRITE
>>> +         | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>>>    void *stack = __mmap (NULL, stack_size, prot,
>>>              MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>>>    if (__glibc_unlikely (stack == MAP_FAILED))
>>> -    return errno;
>>> -
>>> -  /* Disable asynchronous cancellation.  */
>>> -  int state;
>>> -  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
>>> -
>>> -  /* Child must set args.err to something non-negative - we rely on
>>> -     the parent and child sharing VM.  */
>>> -  args.err = 0;
>>> -  args.file = file;
>>> -  args.exec = exec;
>>> -  args.fa = file_actions;
>>> -  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
>>> -  args.argv = argv;
>>> -  args.argc = argc;
>>> -  args.envp = envp;
>>> -  args.xflags = xflags;
>>> -
>>> -  __libc_signal_block_all (&args.oldmask);
>>
>> The caller expects the error to be in "*ec".
> 
> Ack.
> 
>>> +  /* There is no much point in retrying with fork and exec if kernel returns a
>>> +     failure due resource exhaustion.  */
>>> +  return *new_pid > 0 || (errno != ENOMEM && errno != EAGAIN);
>>> +}
>>> +
>>> +/* Fallback spawn case which uses fork plus exec along with a pipe to
>>> +   communicate any helper process prepare pahse or execve failure.  */
>>> +static void
>>> +spawni_fork_fallback (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
>>> +{
>>> +  if (__pipe2 (args->pipe, O_CLOEXEC))
>>
>> Same here: the caller expects the error to be in "*ec".
> 
> Ack.
> 
>>
>>> +    return;
>>> +
>>> +  /* Do not trigger atfork handler nor any internal state reset since the
>>> +     helper process will call execve.  */
>>> +  *new_pid = _Fork ();
>>> +  if (*new_pid == 0)
>>> +    __spawni_child (args);
>>> +  else if (*new_pid > 0)
>>> +    {
>>> +      __close (args->pipe[1]);
>>
>> The write end should be closed on the error path too.
> 
> Ack, I moved the __close (args.pipe[1]) to be along __close (args.pipe[0]).
> 
> 
>>> +  pid_t new_pid = 0;
>>> +  int ec = -1;
>>> +  /* clone with CLONE_VM | CLONE_VFORK may fail for some namespace restriction
>>> +     (for instance Linux does not allow processes in different time namespaces
>>> +     to share address space) and in this case clone fails with EINVAL.  Retry
>>> +     with fork and exec.  */
>>> +  if (!spawni_clone (&args, &new_pid, &ec))
>>> +    spawni_fork_fallback (&args, &new_pid, &ec);
>>> +
>>>    if ((ec == 0) && (pid != NULL))
>>>      *pid = new_pid;
>>
>> Shouldn't also __spawni_child() check whether posix_spawn actions are trying to do something with the error pipe write end (e.g. to close it, to dup() it or dup2() over it, etc.) and prevent that?
> 
> Yeah we should, we need to either duplicate the file descriptor if a file operation
> might clobber it and return EBADF for dup2 file action.

And using a pipe brings another issue with closefrom where the file action might
close the pipe's end and using a dup will help much (since we can not be sure
if the duplicate file descriptor will be lower than the closefrom value).  Maybe
using mmap with MAP_SHARED would be better here.
Alexey Izbyshev May 2, 2022, 8:02 p.m. UTC | #7
On 2022-05-02 22:57, Adhemerval Zanella wrote:
> On 02/05/2022 16:42, Adhemerval Zanella wrote:
>> 
>> 
>> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>>> Shouldn't also __spawni_child() check whether posix_spawn actions are 
>>> trying to do something with the error pipe write end (e.g. to close 
>>> it, to dup() it or dup2() over it, etc.) and prevent that?
>> 
>> Yeah we should, we need to either duplicate the file descriptor if a 
>> file operation
>> might clobber it and return EBADF for dup2 file action.
> 
> And using a pipe brings another issue with closefrom where the file 
> action might
> close the pipe's end and using a dup will help much (since we can not 
> be sure
> if the duplicate file descriptor will be lower than the closefrom 
> value).  Maybe
> using mmap with MAP_SHARED would be better here.

But closefrom() is implemented via close_range() on Linux, so can we 
just close_range(start, pipe_fd - 1) and close_range(pipe_fd + 1, 
INT_MAX)? (Ignoring edge conditions here).

Alexey
Adhemerval Zanella Netto May 2, 2022, 8:02 p.m. UTC | #8
On 02/05/2022 16:54, Alexey Izbyshev wrote:
> On 2022-05-02 22:42, Adhemerval Zanella wrote:
>> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>>> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
>>>
>>> If the parent is killed, write() will fail with EPIPE, so this loop will become infinite. Maybe check for EPIPE and break?
>>
>> We block all signals (__libc_signal_block_all), so the parent will be
>> only killed
>> by signals that can no be caught (SIGKILL).  Same for SIGPIPE.
> 
> The parent can indeed only be killed by SIGKILL, or simply exit from another thread, but how exactly it dies doesn't matter. But signals are already unblocked at this point in the child, so if SIGPIPE was ignored in the parent, it's ignored now in the child and is unblocked, hence write() can fail with EPIPE.
> 

Right, I was thinking in term of the CLONE_VFORK which is not the case.  But
now checking on the closefrom and the pipe file descriptor issue, it seems
it would be better to have the error code communicate through a shared memory
instead of a pipe.
Adhemerval Zanella Netto May 2, 2022, 8:04 p.m. UTC | #9
On 02/05/2022 17:02, Alexey Izbyshev wrote:
> On 2022-05-02 22:57, Adhemerval Zanella wrote:
>> On 02/05/2022 16:42, Adhemerval Zanella wrote:
>>>
>>>
>>> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>>>> Shouldn't also __spawni_child() check whether posix_spawn actions are trying to do something with the error pipe write end (e.g. to close it, to dup() it or dup2() over it, etc.) and prevent that?
>>>
>>> Yeah we should, we need to either duplicate the file descriptor if a file operation
>>> might clobber it and return EBADF for dup2 file action.
>>
>> And using a pipe brings another issue with closefrom where the file action might
>> close the pipe's end and using a dup will help much (since we can not be sure
>> if the duplicate file descriptor will be lower than the closefrom value).  Maybe
>> using mmap with MAP_SHARED would be better here.
> 
> But closefrom() is implemented via close_range() on Linux, so can we just close_range(start, pipe_fd - 1) and close_range(pipe_fd + 1, INT_MAX)? (Ignoring edge conditions here).

It should work indeed.
Florian Weimer May 2, 2022, 8:05 p.m. UTC | #10
* Alexey Izbyshev:

> On 2022-05-02 22:30, Florian Weimer wrote:
>> * Alexey Izbyshev:
>> 
>>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) <
>>>> 0);
>>> If the parent is killed, write() will fail with EPIPE, so this loop
>>> will become infinite. Maybe check for EPIPE and break?
>> I think we also need to do something to avoid invoking a signal
>> handler
>> for SIGPIPE here.
>> 
> All signal handlers are reset in the beginning of __spawni_child().

Then maybe add a comment to this effect?

Thanks,
Florian
Alexey Izbyshev May 2, 2022, 8:08 p.m. UTC | #11
On 2022-05-02 23:02, Adhemerval Zanella wrote:
> On 02/05/2022 16:54, Alexey Izbyshev wrote:
>> On 2022-05-02 22:42, Adhemerval Zanella wrote:
>>> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>>>> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>>>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) 
>>>>> < 0);
>>>> 
>>>> If the parent is killed, write() will fail with EPIPE, so this loop 
>>>> will become infinite. Maybe check for EPIPE and break?
>>> 
>>> We block all signals (__libc_signal_block_all), so the parent will be
>>> only killed
>>> by signals that can no be caught (SIGKILL).  Same for SIGPIPE.
>> 
>> The parent can indeed only be killed by SIGKILL, or simply exit from 
>> another thread, but how exactly it dies doesn't matter. But signals 
>> are already unblocked at this point in the child, so if SIGPIPE was 
>> ignored in the parent, it's ignored now in the child and is unblocked, 
>> hence write() can fail with EPIPE.
>> 
> 
> Right, I was thinking in term of the CLONE_VFORK which is not the case. 
>  But
> now checking on the closefrom and the pipe file descriptor issue, it 
> seems
> it would be better to have the error code communicate through a shared 
> memory
> instead of a pipe.

How do you propose to wait until the child exec's if you use shared 
memory? Do you plan to use clone(CLONE_VFORK) (but without CLONE_VM)?

Alexey
Adhemerval Zanella Netto May 2, 2022, 8:09 p.m. UTC | #12
On 02/05/2022 17:08, Alexey Izbyshev wrote:
> On 2022-05-02 23:02, Adhemerval Zanella wrote:
>> On 02/05/2022 16:54, Alexey Izbyshev wrote:
>>> On 2022-05-02 22:42, Adhemerval Zanella wrote:
>>>> On 02/05/2022 16:15, Alexey Izbyshev wrote:
>>>>> On 2022-05-02 21:06, Adhemerval Zanella wrote:
>>>>>> +      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
>>>>>
>>>>> If the parent is killed, write() will fail with EPIPE, so this loop will become infinite. Maybe check for EPIPE and break?
>>>>
>>>> We block all signals (__libc_signal_block_all), so the parent will be
>>>> only killed
>>>> by signals that can no be caught (SIGKILL).  Same for SIGPIPE.
>>>
>>> The parent can indeed only be killed by SIGKILL, or simply exit from another thread, but how exactly it dies doesn't matter. But signals are already unblocked at this point in the child, so if SIGPIPE was ignored in the parent, it's ignored now in the child and is unblocked, hence write() can fail with EPIPE.
>>>
>>
>> Right, I was thinking in term of the CLONE_VFORK which is not the case.  But
>> now checking on the closefrom and the pipe file descriptor issue, it seems
>> it would be better to have the error code communicate through a shared memory
>> instead of a pipe.
> 
> How do you propose to wait until the child exec's if you use shared memory? Do you plan to use clone(CLONE_VFORK) (but without CLONE_VM)?

Your suggestion to use a two clone_range calls is way simpler than a shared map.
diff mbox series

Patch

diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index d6f5ca89cd..b73715da7e 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -67,6 +67,7 @@  struct posix_spawn_args
   char *const *envp;
   int xflags;
   int err;
+  int pipe[2];
 };
 
 /* Older version requires that shell script without shebang definition
@@ -103,6 +104,7 @@  __spawni_child (void *arguments)
   struct posix_spawn_args *args = arguments;
   const posix_spawnattr_t *restrict attr = args->attr;
   const posix_spawn_file_actions_t *file_actions = args->fa;
+  bool use_pipe = args->pipe[0] != -1;
 
   /* The child must ensure that no signal handler are enabled because it shared
      memory with parent, so the signal disposition must be either SIG_DFL or
@@ -113,6 +115,9 @@  __spawni_child (void *arguments)
   struct sigaction sa;
   memset (&sa, '\0', sizeof (sa));
 
+  if (use_pipe)
+    __close (args->pipe[0]);
+
   sigset_t hset;
   __sigprocmask (SIG_BLOCK, 0, &hset);
   for (int sig = 1; sig < _NSIG; ++sig)
@@ -300,45 +305,26 @@  fail:
      (EINTERNALBUG) describing that, use ECHILD.  Another option would
      be to set args->err to some negative sentinel and have the parent
      abort(), but that seems needlessly harsh.  */
-  args->err = errno ? : ECHILD;
+  int ret = errno ? : ECHILD;
+  if (use_pipe)
+    {
+      __close (args->pipe[0]);
+      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
+    }
+  else
+    args->err = ret;
+
   _exit (SPAWN_ERROR);
 }
 
-/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
-   Before running the process perform the actions described in FILE-ACTIONS. */
-static int
-__spawnix (pid_t * pid, const char *file,
-	   const posix_spawn_file_actions_t * file_actions,
-	   const posix_spawnattr_t * attrp, char *const argv[],
-	   char *const envp[], int xflags,
-	   int (*exec) (const char *, char *const *, char *const *))
+/* Create a new process using clone with CLONE_VM | CLONE_VFORK to optimize
+   memory usage.  Return TRUE is the helper process could be spawned, or
+   FALSE otherwise.  */
+static bool
+spawni_clone (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
 {
-  pid_t new_pid;
-  struct posix_spawn_args args;
-  int ec;
-
-  /* To avoid imposing hard limits on posix_spawn{p} the total number of
-     arguments is first calculated to allocate a mmap to hold all possible
-     values.  */
-  ptrdiff_t argc = 0;
-  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
-     to be used in a execve call.  We limit to INT_MAX minus one due the
-     compatiblity code that may execute a shell script (maybe_script_execute)
-     where it will construct another argument list with an additional
-     argument.  */
-  ptrdiff_t limit = INT_MAX - 1;
-  while (argv[argc++] != NULL)
-    if (argc == limit)
-      {
-	errno = E2BIG;
-	return errno;
-      }
-
-  int prot = (PROT_READ | PROT_WRITE
-	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
-
   /* Add a slack area for child's stack.  */
-  size_t argv_size = (argc * sizeof (void *)) + 512;
+  size_t argv_size = (args->argc * sizeof (void *)) + 512;
   /* We need at least a few pages in case the compiler's stack checking is
      enabled.  In some configs, it is known to use at least 24KiB.  We use
      32KiB to be "safe" from anything the compiler might do.  Besides, the
@@ -347,28 +333,12 @@  __spawnix (pid_t * pid, const char *file,
      where it might use about 1k extra stack space).  */
   argv_size += (32 * 1024);
   size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
+  int prot = (PROT_READ | PROT_WRITE
+	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
   void *stack = __mmap (NULL, stack_size, prot,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
   if (__glibc_unlikely (stack == MAP_FAILED))
-    return errno;
-
-  /* Disable asynchronous cancellation.  */
-  int state;
-  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
-
-  /* Child must set args.err to something non-negative - we rely on
-     the parent and child sharing VM.  */
-  args.err = 0;
-  args.file = file;
-  args.exec = exec;
-  args.fa = file_actions;
-  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
-  args.argv = argv;
-  args.argc = argc;
-  args.envp = envp;
-  args.xflags = xflags;
-
-  __libc_signal_block_all (&args.oldmask);
+    return true;
 
   /* The clone flags used will create a new child that will run in the same
      memory space (CLONE_VM) and the execution of calling thread will be
@@ -385,12 +355,12 @@  __spawnix (pid_t * pid, const char *file,
       .stack = (uintptr_t) stack,
       .stack_size = stack_size,
     };
-  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
+  *new_pid = __clone_internal (&clone_args, __spawni_child, args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
      for execve itself).  */
-  if (new_pid > 0)
+  if (*new_pid > 0)
     {
       /* Also, it handles the unlikely case where the auxiliary process was
 	 terminated before calling execve as if it was successfully.  The
@@ -398,21 +368,108 @@  __spawnix (pid_t * pid, const char *file,
 	 only in case of failure, so in case of premature termination
 	 due a signal args.err will remain zeroed and it will be up to
 	 caller to actually collect it.  */
-      ec = args.err;
-      if (ec > 0)
+      *ec = args->err;
+      if (*ec > 0)
 	/* There still an unlikely case where the child is cancelled after
 	   setting args.err, due to a positive error value.  Also there is
 	   possible pid reuse race (where the kernel allocated the same pid
 	   to an unrelated process).  Unfortunately due synchronization
 	   issues where the kernel might not have the process collected
 	   the waitpid below can not use WNOHANG.  */
-	__waitpid (new_pid, NULL, 0);
+	__waitpid (*new_pid, NULL, 0);
     }
   else
-    ec = errno;
+    *ec = errno;
 
   __munmap (stack, stack_size);
 
+  /* There is no much point in retrying with fork and exec if kernel returns a
+     failure due resource exhaustion.  */
+  return *new_pid > 0 || (errno != ENOMEM && errno != EAGAIN);
+}
+
+/* Fallback spawn case which uses fork plus exec along with a pipe to
+   communicate any helper process prepare pahse or execve failure.  */
+static void
+spawni_fork_fallback (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
+{
+  if (__pipe2 (args->pipe, O_CLOEXEC))
+    return;
+
+  /* Do not trigger atfork handler nor any internal state reset since the
+     helper process will call execve.  */
+  *new_pid = _Fork ();
+  if (*new_pid == 0)
+    __spawni_child (args);
+  else if (*new_pid > 0)
+    {
+      __close (args->pipe[1]);
+      if (__read (args->pipe[0], ec, sizeof *ec) != sizeof *ec)
+	*ec = 0;
+      else
+	__waitpid (*new_pid, NULL, 0);
+    }
+  else
+    *ec = errno;
+
+  __close (args->pipe[0]);
+}
+
+/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
+   Before running the process perform the actions described in FILE-ACTIONS. */
+static int
+__spawnix (pid_t * pid, const char *file,
+	   const posix_spawn_file_actions_t * file_actions,
+	   const posix_spawnattr_t * attrp, char *const argv[],
+	   char *const envp[], int xflags,
+	   int (*exec) (const char *, char *const *, char *const *))
+{
+  /* To avoid imposing hard limits on posix_spawn{p} the total number of
+     arguments is first calculated to allocate a mmap to hold all possible
+     values.  */
+  ptrdiff_t argc = 0;
+  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
+     to be used in a execve call.  We limit to INT_MAX minus one due the
+     compatiblity code that may execute a shell script (maybe_script_execute)
+     where it will construct another argument list with an additional
+     argument.  */
+  ptrdiff_t limit = INT_MAX - 1;
+  while (argv[argc++] != NULL)
+    if (argc == limit)
+      {
+	errno = E2BIG;
+	return errno;
+      }
+
+  /* Disable asynchronous cancellation.  */
+  int state;
+  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
+
+  /* Child must set args.err to something non-negative - we rely on
+     the parent and child sharing VM.  */
+  struct posix_spawn_args args;
+  args.err = 0;
+  args.file = file;
+  args.exec = exec;
+  args.fa = file_actions;
+  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
+  args.argv = argv;
+  args.argc = argc;
+  args.envp = envp;
+  args.xflags = xflags;
+  args.pipe[0] = args.pipe[1] = -1;
+
+  __libc_signal_block_all (&args.oldmask);
+
+  pid_t new_pid = 0;
+  int ec = -1;
+  /* clone with CLONE_VM | CLONE_VFORK may fail for some namespace restriction
+     (for instance Linux does not allow processes in different time namespaces
+     to share address space) and in this case clone fails with EINVAL.  Retry
+     with fork and exec.  */
+  if (!spawni_clone (&args, &new_pid, &ec))
+    spawni_fork_fallback (&args, &new_pid, &ec);
+
   if ((ec == 0) && (pid != NULL))
     *pid = new_pid;