diff mbox series

[v4,1/2] Add an internal wrapper for clone, clone2 and clone3

Message ID 20210511014356.235353-2-hjl.tools@gmail.com
State New
Headers show
Series Add an internal wrapper for clone, clone2 and clone3 | expand

Commit Message

H.J. Lu May 11, 2021, 1:43 a.m. UTC
The clone3 system call provides a superset of the functionality of clone
and clone2.  It also provides a number of API improve ments, including
the ability to specify the size of the child's stack area which can be
used by kernel to compute the shadow stack size when allocating the
shadow stack.  Add:

extern int __clone_internal (struct clone_args *cl_args,
			     int (*__fn) (void *__arg), void *__arg);

to provide an abstract interface for clone, clone2 and clone3.

1. Add cast_to_pointer to cast an integer to void * pointer.
2. Simplify stack management for thread creation by passing both stack
base and size to create_thread.
3. Consolidate clone vs clone2 differences into a single file.
4. Use only __clone_internal to clone a thread.  If clone3 returns -1
with ENOSYS, __clone_internal will fall back to clone or clone2.
5. Add the x86-64 clone3 wrapper.
6. Enable the public clone3 wrapper in the future after it has been
added to all targets.

Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
i686 clone fallback are tested under 5.12.2 kernel.
---
 include/clone3.h                          | 14 ++++
 include/libc-pointer-arith.h              |  3 +
 sysdeps/unix/sysv/linux/Makefile          |  4 +-
 sysdeps/unix/sysv/linux/Versions          |  1 +
 sysdeps/unix/sysv/linux/clone-internal.c  | 80 ++++++++++++++++++++
 sysdeps/unix/sysv/linux/clone-offsets.sym |  5 ++
 sysdeps/unix/sysv/linux/clone3.c          | 34 +++++++++
 sysdeps/unix/sysv/linux/clone3.h          | 58 ++++++++++++++
 sysdeps/unix/sysv/linux/createthread.c    | 25 +++---
 sysdeps/unix/sysv/linux/spawni.c          | 26 +++----
 sysdeps/unix/sysv/linux/tst-align-clone.c | 55 +++++++++++---
 sysdeps/unix/sysv/linux/tst-clone.c       | 23 +++++-
 sysdeps/unix/sysv/linux/tst-clone2.c      | 61 +++++++++++----
 sysdeps/unix/sysv/linux/tst-clone3.c      | 60 +++++++++++----
 sysdeps/unix/sysv/linux/tst-getpid1.c     | 54 ++++++++++---
 sysdeps/unix/sysv/linux/x86_64/clone3.S   | 92 +++++++++++++++++++++++
 16 files changed, 517 insertions(+), 78 deletions(-)
 create mode 100644 include/clone3.h
 create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
 create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
 create mode 100644 sysdeps/unix/sysv/linux/clone3.c
 create mode 100644 sysdeps/unix/sysv/linux/clone3.h
 create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S

Comments

Adhemerval Zanella Netto May 13, 2021, 2:51 p.m. UTC | #1
On 10/05/2021 22:43, H.J. Lu via Libc-alpha wrote:
> The clone3 system call provides a superset of the functionality of clone
> and clone2.  It also provides a number of API improve ments, including
> the ability to specify the size of the child's stack area which can be
> used by kernel to compute the shadow stack size when allocating the
> shadow stack.  Add:
> 
> extern int __clone_internal (struct clone_args *cl_args,
> 			     int (*__fn) (void *__arg), void *__arg);
> 
> to provide an abstract interface for clone, clone2 and clone3.
> 
> 1. Add cast_to_pointer to cast an integer to void * pointer.
> 2. Simplify stack management for thread creation by passing both stack
> base and size to create_thread.
> 3. Consolidate clone vs clone2 differences into a single file.
> 4. Use only __clone_internal to clone a thread.  If clone3 returns -1
> with ENOSYS, __clone_internal will fall back to clone or clone2.
> 5. Add the x86-64 clone3 wrapper.
> 6. Enable the public clone3 wrapper in the future after it has been
> added to all targets.
> 

This patch is doing both a refactoring to internal clone usage and
adding the clone3.  I think it would be better to split two patches:
changing the internal clone calls to use the clone_internal and then
add clone3 support along with x86_64 support.

There are some comments below.

> Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
> i686 clone fallback are tested under 5.12.2 kernel.
> ---
>  include/clone3.h                          | 14 ++++
>  include/libc-pointer-arith.h              |  3 +
>  sysdeps/unix/sysv/linux/Makefile          |  4 +-
>  sysdeps/unix/sysv/linux/Versions          |  1 +
>  sysdeps/unix/sysv/linux/clone-internal.c  | 80 ++++++++++++++++++++
>  sysdeps/unix/sysv/linux/clone-offsets.sym |  5 ++
>  sysdeps/unix/sysv/linux/clone3.c          | 34 +++++++++
>  sysdeps/unix/sysv/linux/clone3.h          | 58 ++++++++++++++
>  sysdeps/unix/sysv/linux/createthread.c    | 25 +++---
>  sysdeps/unix/sysv/linux/spawni.c          | 26 +++----
>  sysdeps/unix/sysv/linux/tst-align-clone.c | 55 +++++++++++---
>  sysdeps/unix/sysv/linux/tst-clone.c       | 23 +++++-
>  sysdeps/unix/sysv/linux/tst-clone2.c      | 61 +++++++++++----
>  sysdeps/unix/sysv/linux/tst-clone3.c      | 60 +++++++++++----
>  sysdeps/unix/sysv/linux/tst-getpid1.c     | 54 ++++++++++---
>  sysdeps/unix/sysv/linux/x86_64/clone3.S   | 92 +++++++++++++++++++++++
>  16 files changed, 517 insertions(+), 78 deletions(-)
>  create mode 100644 include/clone3.h
>  create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
>  create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
>  create mode 100644 sysdeps/unix/sysv/linux/clone3.c
>  create mode 100644 sysdeps/unix/sysv/linux/clone3.h
>  create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S
> 
> diff --git a/include/clone3.h b/include/clone3.h
> new file mode 100644
> index 0000000000..124f7ba169
> --- /dev/null
> +++ b/include/clone3.h
> @@ -0,0 +1,14 @@
> +#ifndef _CLONE3_H
> +#include_next <clone3.h>
> +
> +extern __typeof (clone3) __clone3;
> +
> +/* The internal wrapper of clone and clone3.  */
> +extern __typeof (clone3) __clone_internal;
> +
> +#ifndef _ISOMAC
> +libc_hidden_proto (__clone3)
> +libc_hidden_proto (__clone_internal)
> +#endif
> +
> +#endif

I would use clone_internal.h here (clone3.h is currently .

> diff --git a/include/libc-pointer-arith.h b/include/libc-pointer-arith.h
> index 72e722c5aa..04ba537617 100644
> --- a/include/libc-pointer-arith.h
> +++ b/include/libc-pointer-arith.h
> @@ -37,6 +37,9 @@
>  /* Cast an integer or a pointer VAL to integer with proper type.  */
>  # define cast_to_integer(val) ((__integer_if_pointer_type (val)) (val))
>  
> +/* Cast an integer VAL to void * pointer.  */
> +# define cast_to_pointer(val) ((void *) (uintptr_t) (val))
> +
>  /* Align a value by rounding down to closest size.
>     e.g. Using size of 4096, we get this behavior:
>  	{4095, 4096, 4097} = {0, 4096, 4096}.  */

Ok.

> diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> index fb155cf856..7c1f32b84d 100644
> --- a/sysdeps/unix/sysv/linux/Makefile
> +++ b/sysdeps/unix/sysv/linux/Makefile
> @@ -54,6 +54,8 @@ CFLAGS-malloc.c += -DMORECORE_CLEARS=2
>  endif
>  
>  ifeq ($(subdir),misc)
> +gen-as-const-headers += clone-offsets.sym
> +
>  sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
>  		   setfsuid setfsgid epoll_pwait signalfd \
>  		   eventfd eventfd_read eventfd_write prlimit \
> @@ -64,7 +66,7 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
>  		   time64-support pselect32 \
>  		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
>  		   fxstatat fxstatat64 \
> -		   xmknod xmknodat
> +		   xmknod xmknodat clone3 clone-internal
>  
>  CFLAGS-gethostid.c = -fexceptions
>  CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables

Ok.

> diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
> index 220bb2dffe..299d4fef9c 100644
> --- a/sysdeps/unix/sysv/linux/Versions
> +++ b/sysdeps/unix/sysv/linux/Versions
> @@ -179,6 +179,7 @@ libc {
>      __sigtimedwait;
>      # functions used by nscd
>      __netlink_assert_response;
> +    __clone_internal;
>    }
>  }
>  

Ok, it is used by pthread_create.

> diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
> new file mode 100644
> index 0000000000..acafb78d81
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone-internal.c
> @@ -0,0 +1,80 @@
> +/* The internal wrapper of clone and clone3.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <sched.h>
> +#include <clone3.h>
> +#include <libc-pointer-arith.h>	/* For cast_to_pointer.  */
> +#include <stackinfo.h>		/* For _STACK_GROWS_{UP,DOWN}.  */
> +
> +int
> +__clone_internal (struct clone_args *cl_args,
> +		  int (*fn) (void *arg), void *arg)
> +{
> +  int ret;
> +#ifdef CLONE_ARGS_SIZE_VER0

I think we should issue clone3 regardless of CLONE_ARGS_SIZE_VER0 is 
defined (as we do for all other syscalls).  We could use the define to 
add similar checks the kernel does, to assure our defined struct is 
similar to the one expected by the kernel:

| #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
| #define offsetofend(TYPE, MEMBER) \
|         (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
|
| #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
| #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
| #define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
|
| _Static_assert (offsetofend(struct clone_args, tls) == CLONE_ARGS_SIZE_VER0,
|		  "offsetofend(struct clone_args, tls) != CLONE_ARGS_SIZE_VER0");
| _Static_assert (offsetofend(struct clone_args, set_tid_size) == CLONE_ARGS_SIZE_VER1,
|		  "offsetofend(struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1");
| _Static_assert (offsetofend(struct clone_args, cgroup) == CLONE_ARGS_SIZE_VER0,
|		  "offsetofend(struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER0");
| _Static_assert (sizeof(struct clone_args) == CLONE_ARGS_SIZE_VER2,
|		  "sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2");

> +  /* Try clone3 first.  */
> +  int saved_errno = errno;
> +  ret = __clone3 (cl_args, fn, arg);
> +  if (ret != -1 || errno != ENOSYS)
> +    return ret;

It will causa an extra function call for all architectures that do not
proper implement clone3.  Maybe a per-architecture define (HAVE_CLONE3_IMPL
define at sysdep.h) to indicate it implements clone3 (since I assume we
won't add a clone3.S for *all* archictures at same time and it might take
some time until we have support for all architectures):

> +
> +  /* NB: Restore errno since errno may be checked against non-zero
> +     return value.  */
> +  __set_errno (saved_errno);
> +#else
> +    /* Check invalid arguments.  */
> +  if (cl_args == NULL || fn == NULL)
> +    __set_errno (EINVAL);
> +#endif
> +
> +  /* Check invalid clone3 specific bits.  */
> +  if ((cl_args->flags & CSIGNAL) != 0
> +      || (cl_args->flags & __UINT64_C (0xffffffff00000000)))
> +    {
> +      __set_errno (EINVAL);
> +      return -1;
> +    }
> +

Should we mimic the kernel check here? 'clone3' will already return EINVAL
in such case and 'clone' will clear the higher bits.

> +  /* Map clone3 arguments to clone arguments.  */
> +  int flags = cl_args->flags | cl_args->exit_signal;
> +  void *stack = cast_to_pointer (cl_args->stack);
> +
> +#ifdef __ia64__
> +  ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,

I think there is no need of this cast (the __clone2 has a proper prototype).

> +		  flags, arg,
> +		  cast_to_pointer (cl_args->parent_tid),
> +		  cast_to_pointer (cl_args->tls),
> +		  cast_to_pointer (cl_args->child_tid));
> +#else
> +# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
> +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> +# endif
> +
> +# if _STACK_GROWS_DOWN
> +  stack += cl_args->stack_size;
> +# endif
> +  ret = __clone (fn, stack, flags, arg,
> +		 cast_to_pointer (cl_args->parent_tid),
> +		 cast_to_pointer (cl_args->tls),
> +		 cast_to_pointer (cl_args->child_tid));
> +#endif
> +  return ret;
> +}
> +
> +libc_hidden_def (__clone_internal)

Ok.

> diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
> new file mode 100644
> index 0000000000..d767e49fc8
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
> @@ -0,0 +1,5 @@
> +#include <clone3.h>
> +
> +--
> +
> +CLONE_ARGS_SIZE			sizeof (struct clone_args)

Ok.

> diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
> new file mode 100644
> index 0000000000..b7a41f82e8
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone3.c
> @@ -0,0 +1,34 @@
> +/* The clone3 syscall wrapper.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <stddef.h>
> +#include <clone3.h>
> +
> +int
> +__clone3 (struct clone_args *cl_args, int (*fn) (void *arg), void *arg)
> +{
> +  if (cl_args == NULL || fn == NULL)
> +    __set_errno (EINVAL);
> +  else
> +    __set_errno (ENOSYS);
> +  return -1;
> +}
> +
> +libc_hidden_def (__clone3)
> +weak_alias (__clone3, clone3)

Ok, although I think se we are aiming to not export yet I would prefer to
conditionalize clone3 usage for architectures that actually implements
the clone3.S by using a per-architecture define.

I am not sure it would be valuable to export clone3 while providing a
proper implementation only for a handful of architectures.

> diff --git a/sysdeps/unix/sysv/linux/clone3.h b/sysdeps/unix/sysv/linux/clone3.h
> new file mode 100644
> index 0000000000..be8a11a17c
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/clone3.h
> @@ -0,0 +1,58 @@
> +/* The wrapper of clone3.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef _CLONE3_H
> +#define _CLONE3_H	1
> +
> +#include <features.h>
> +#include <stdint.h>
> +#define clone_args __redirect_clone_args
> +#include <linux/sched.h>
> +#undef clone_args
I think we include it mainly for the CLONE_ARGS_SIZE_VER0 and the define
is used solely to conditionaly build the clone3 call.  I am not very found 
adding it such requirement, specially on a header that might be aimed to 
be installed (I take it from __BEGIN_DECLS below).

I dont't think we need to conditionalize the clone3 call to the
CLONE_ARGS_SIZE_VER0, specially because the clone_args structure below
seems to cover not only VER0, but also VER1 and VER2.

> +
> +__BEGIN_DECLS
> +
> +struct clone_args
> +{
> +  uint64_t flags;	 /* Flags bit mask.  */

The kernel uses __aligned_u64, which is define as:

include/uapi/linux/types.h
 48 #define __aligned_u64 __u64 __attribute__((aligned(8)))

Although it works on most architectures, uint64_t does not have
alignment of 8 for every supported architecture.  Trying to compiler
the small snippet below:

| #include <stdint.h>
| _Static_assert (_Alignof (uint64_t) == 8, "_Alignof (uint64_t) != 8");

It fails for i686, m68k, microblaze{el}, nios2, and sh4. So I think we 
will need to use '__attribute__((aligned(8)))' on our internal definition
as well.

> +  uint64_t pidfd;	 /* Where to store PID file descriptor
> +			    (pid_t *).  */
> +  uint64_t child_tid;	 /* Where to store child TID, in child's memory
> +			    (pid_t *).  */
> +  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
> +			    (int *). */
> +  uint64_t exit_signal;	 /* Signal to deliver to parent on child
> +			    termination */
> +  uint64_t stack;	 /* The lowest address of stack.  */
> +  uint64_t stack_size;	 /* Size of stack.  */
> +  uint64_t tls;		 /* Location of new TLS.  */
> +  uint64_t set_tid;	 /* Pointer to a pid_t array
> +			    (since Linux 5.5).  */
> +  uint64_t set_tid_size; /* Number of elements in set_tid
> +			    (since Linux 5.5). */
> +  uint64_t cgroup;	 /* File descriptor for target cgroup
> +			    of child (since Linux 5.7).  */
> +};
> +
> +/* The wrapper of clone3.  */
> +extern int clone3 (struct clone_args *cl_args,
> +		   int (*__fn) (void *__arg), void *__arg);
> +
> +__END_DECLS
> +
> +#endif /* clone3.h */
> diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
> index bc3409b326..2b5e6cb131 100644
> --- a/sysdeps/unix/sysv/linux/createthread.c
> +++ b/sysdeps/unix/sysv/linux/createthread.c
> @@ -18,6 +18,7 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <sched.h>
> +#include <clone3.h>
>  #include <setjmp.h>
>  #include <signal.h>
>  #include <stdlib.h>
> @@ -28,12 +29,6 @@
>  
>  #include <arch-fork.h>
>  
> -#ifdef __NR_clone2
> -# define ARCH_CLONE __clone2
> -#else
> -# define ARCH_CLONE __clone
> -#endif
> -
>  /* See the comments in pthread_create.c for the requirements for these
>     two macros and the create_thread function.  */
>  

Ok.

> @@ -47,7 +42,8 @@ static int start_thread (void *arg) __attribute__ ((noreturn));
>  
>  static int
>  create_thread (struct pthread *pd, const struct pthread_attr *attr,
> -	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
> +	       bool *stopped_start, void *stackaddr, size_t stacksize,
> +	       bool *thread_ran)
>  {
>    /* Determine whether the newly created threads has to be started
>       stopped since we have to set the scheduling parameters or set the
> @@ -100,9 +96,18 @@ create_thread (struct pthread *pd, const struct pthread_attr *attr,
>  
>    TLS_DEFINE_INIT_TP (tp, pd);
>  
> -  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
> -				    clone_flags, pd, &pd->tid, tp, &pd->tid)
> -			== -1))
> +  struct clone_args args =
> +    {
> +      .flags = clone_flags,
> +      .pidfd = (uintptr_t) &pd->tid,
> +      .parent_tid = (uintptr_t) &pd->tid,
> +      .child_tid = (uintptr_t) &pd->tid,
> +      .stack = (uintptr_t) stackaddr,
> +      .stack_size = stacksize,
> +      .tls = (uintptr_t) tp,
> +    };
> +  int ret = __clone_internal (&args, &start_thread, pd);
> +  if (__glibc_unlikely (ret == -1))
>      return errno;
>  
>    /* It's started now, so if we fail below, we'll have to cancel it

Ok.

> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> index 501f8fbccd..92b61e55d3 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -31,6 +31,7 @@
>  #include <dl-sysdep.h>
>  #include <libc-pointer-arith.h>
>  #include <ldsodefs.h>
> +#include <clone3.h>
>  #include "spawn_int.h"
>  
>  /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
> @@ -59,21 +60,6 @@
>     normal program exit with the exit code 127.  */
>  #define SPAWN_ERROR	127
>  
> -#ifdef __ia64__
> -# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
> -  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
> -#else
> -# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
> -  __clone (__fn, __stack, __flags, __args)
> -#endif
> -
> -/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
> -#if _STACK_GROWS_UP || defined (__ia64__)
> -# define STACK(__stack, __stack_size) (__stack)
> -#elif _STACK_GROWS_DOWN
> -# define STACK(__stack, __stack_size) (__stack + __stack_size)
> -#endif
> -
>  
>  struct posix_spawn_args
>  {
> @@ -378,8 +364,14 @@ __spawnix (pid_t * pid, const char *file,
>       need for CLONE_SETTLS.  Although parent and child share the same TLS
>       namespace, there will be no concurrent access for TLS variables (errno
>       for instance).  */
> -  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
> -		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
> +  struct clone_args clone_args =
> +    {
> +      .flags = CLONE_VM | CLONE_VFORK,
> +      .exit_signal = SIGCHLD,
> +      .stack = (uintptr_t) stack,
> +      .stack_size = stack_size,
> +    };
> +  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
>  
>    /* It needs to collect the case where the auxiliary process was created
>       but failed to execute the file (due either any preparation step or

Ok.

> diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
> index 6ace61bac3..5a463e7af3 100644
> --- a/sysdeps/unix/sysv/linux/tst-align-clone.c
> +++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
> @@ -16,6 +16,7 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <sched.h>
> +#include <clone3.h>
>  #include <stdbool.h>
>  #include <stdint.h>
>  #include <stdio.h>
> @@ -39,7 +40,7 @@ f (void *arg)
>  }
>  
>  static int
> -do_test (void)
> +do_test_clone (bool use_clone_internal)
>  {
>    bool ok = true;
>  
> @@ -49,21 +50,41 @@ do_test (void)
>      ok = false;
>  
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024];
> -  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
> +# define STACK_SIZE 256 * 1024
> +#else
> +# define STACK_SIZE 128 * 1024
> +#endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +
> +  pid_t p;
> +
> +  if (use_clone_internal)
> +    {
> +      struct clone_args clone_args =
> +	{
> +	  .stack = (uintptr_t) st,
> +	  .stack_size = sizeof (st),
> +	};
> +      p = __clone_internal (&clone_args, f, 0);

It only works because it is linking against to __clone_interal@@GLIBC_PRIVATE,
which I don't think it a good way to check for internal interfaces. 

I would rather add a test-internal to check for __clone_internal usage.

> +    }
> +  else
> +    {
> +#ifdef __ia64__
> +      extern int __clone2 (int (*__fn) (void *__arg),
> +			   void *__child_stack_base,
> +			   size_t __child_stack_size, int __flags,
> +			   void *__arg, ...);
> +      p = __clone2 (f, st, sizeof (st), 0, 0);
>  #else
> -  char st[128 * 1024] __attribute__ ((aligned));
>  # if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), 0, 0);
> +      p = clone (f, st + sizeof (st), 0, 0);
>  # elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, 0, 0);
> +      p = clone (f, st, 0, 0);
>  # else
>  #  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
>  # endif
>  #endif
> +    }
>    if (p == -1)
>      {
>        printf("clone failed: %m\n");
> @@ -91,5 +112,21 @@ do_test (void)
>    return ok ? 0 : 1;
>  }
>  
> +static int
> +do_test (void)
> +{
> +  bool ok = true;
> +
> +  puts ("in main");
> +
> +  if (do_test_clone (false) != 0)
> +    ok = false;
> +
> +  if (do_test_clone (true) != 0)
> +    ok = false;
> +
> +  return ok ? 0 : 1;
> +}
> +
>  #define TEST_FUNCTION do_test ()
>  #include "../test-skeleton.c"

Ok.

> diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
> index e6ae0106ef..855f48121e 100644
> --- a/sysdeps/unix/sysv/linux/tst-clone.c
> +++ b/sysdeps/unix/sysv/linux/tst-clone.c
> @@ -22,6 +22,8 @@
>  #include <stdlib.h>
>  #include <unistd.h>
>  #include <sched.h>
> +#include <clone3.h>
> +#include <stdbool.h>
>  
>  #ifdef __ia64__
>  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> @@ -35,14 +37,17 @@ int child_fn(void *arg)
>  }
>  
>  static int
> -do_test (void)
> +do_test_clone (bool use_clone_internal)
>  {
>    int result;
>  
> +  if (use_clone_internal)
> +    result = __clone_internal (NULL, child_fn, NULL);

Same as before.

> +  else
>  #ifdef __ia64__
> -  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
> +    result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
>  #else
> -  result = clone (child_fn, NULL, 0, NULL);
> +    result = clone (child_fn, NULL, 0, NULL);
>  #endif
>  
>    if (errno != EINVAL || result != -1)
> @@ -52,6 +57,18 @@ do_test (void)
>        return 1;
>      }
>  
> +  return 0;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  if (do_test_clone (false) != 0)
> +    return 1;
> +
> +  if (do_test_clone (true) != 0)
> +    return 1;
> +
>    puts ("All OK");
>    return 0;
>  }
> diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
> index ce36c70c0d..1df336f8c6 100644
> --- a/sysdeps/unix/sysv/linux/tst-clone2.c
> +++ b/sysdeps/unix/sysv/linux/tst-clone2.c
> @@ -17,6 +17,7 @@
>     <https://www.gnu.org/licenses/>.  */
>  
>  #include <sched.h>
> +#include <clone3.h>
>  #include <signal.h>
>  #include <string.h>
>  #include <stdio.h>
> @@ -58,7 +59,7 @@ f (void *a)
>  
>  
>  static int
> -do_test (void)
> +do_test_clone (bool use_clone_internal)
>  {
>    sig = SIGRTMIN;
>    sigset_t ss;
> @@ -70,23 +71,43 @@ do_test (void)
>    if (pipe2 (pipefd, O_CLOEXEC))
>      FAIL_EXIT1 ("pipe failed: %m");
>  
> -  int clone_flags = 0;
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024] __attribute__ ((aligned));
> -  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
> -#else
> -  char st[128 * 1024] __attribute__ ((aligned));
> -#if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
> -#elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, clone_flags, 0);
> +# define STACK_SIZE 256 * 1024
>  #else
> -#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> +# define STACK_SIZE 128 * 1024
>  #endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +
> +  pid_t p;
> +
> +  if (use_clone_internal)
> +    {
> +      struct clone_args clone_args =
> +	{
> +	  .stack = (uintptr_t) st,
> +	  .stack_size = sizeof (st),
> +	};
> +      p = __clone_internal (&clone_args, f, 0);

Same as before.

> +    }
> +  else
> +    {
> +      int clone_flags = 0;
> +#ifdef __ia64__
> +      extern int __clone2 (int (*__fn) (void *__arg),
> +			   void *__child_stack_base,
> +			   size_t __child_stack_size, int __flags,
> +			   void *__arg, ...);
> +      p = __clone2 (f, st, sizeof (st), clone_flags, 0);
> +#else
> +# if _STACK_GROWS_DOWN
> +      p = clone (f, st + sizeof (st), clone_flags, 0);
> +# elif _STACK_GROWS_UP
> +      p = clone (f, st, clone_flags, 0);
> +# else
> +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> +# endif
>  #endif
> +    }
>  
>    close (pipefd[1]);
>  
> @@ -143,4 +164,16 @@ do_test (void)
>    return ret;
>  }
>  
> +static int
> +do_test (void)
> +{
> +  if (do_test_clone (false) != 0)
> +    return 1;
> +
> +  if (do_test_clone (true) != 0)
> +    return 1;
> +
> +  return 0;
> +}
> +
>  #include <support/test-driver.c>
> diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
> index 1a1bfd4586..f4eabfc5a7 100644
> --- a/sysdeps/unix/sysv/linux/tst-clone3.c
> +++ b/sysdeps/unix/sysv/linux/tst-clone3.c
> @@ -18,6 +18,8 @@
>  
>  #include <string.h>
>  #include <sched.h>
> +#include <clone3.h>
> +#include <stdbool.h>
>  #include <signal.h>
>  #include <unistd.h>
>  #include <sys/syscall.h>
> @@ -64,7 +66,7 @@ futex_wait (int *futexp, int val)
>  }
>  
>  static int
> -do_test (void)
> +do_test_clone (bool use_clone_internal)
>  {
>    char st[1024] __attribute__ ((aligned));
>    int clone_flags = CLONE_THREAD;
> @@ -78,23 +80,39 @@ do_test (void)
>    pid_t ctid = CTID_INIT_VAL;
>    pid_t tid;
>  
> +  if (use_clone_internal)
> +    {
> +      struct clone_args clone_args =
> +	{
> +	  .flags = clone_flags & ~CSIGNAL,
> +	  .exit_signal = clone_flags & CSIGNAL,
> +	  .stack = (uintptr_t) st,
> +	  .stack_size = sizeof (st),
> +	  .child_tid = (uintptr_t) &ctid,
> +	};
> +      tid = __clone_internal (&clone_args, f, NULL);

Same as before.

> +    }
> +  else
> +    {
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> -		  /* tls */ NULL, &ctid);
> -#else
> -#if _STACK_GROWS_DOWN
> -  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> -	       /* tls */ NULL, &ctid);
> -#elif _STACK_GROWS_UP
> -  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
> -	       &ctid);
> +      extern int __clone2 (int (*__fn) (void *__arg),
> +			   void *__child_stack_base,
> +			   size_t __child_stack_size, int __flags,
> +			   void *__arg, ...);
> +      tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> +		      /* tls */ NULL, &ctid);
>  #else
> -#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> -#endif
> +# if _STACK_GROWS_DOWN
> +      tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> +		   /* tls */ NULL, &ctid);
> +# elif _STACK_GROWS_UP
> +      tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
> +		   &ctid);
> +# else
> +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> +# endif
>  #endif
> +    }
>    if (tid == -1)
>      FAIL_EXIT1 ("clone failed: %m");
>  
> @@ -103,5 +121,17 @@ do_test (void)
>    return 2;
>  }
>  
> +static int
> +do_test (void)
> +{
> +  if (do_test_clone (false) != 2)
> +    return 1;
> +
> +  if (do_test_clone (true) != 2)
> +    return 1;
> +
> +  return 2;
> +}
> +
>  #define EXPECTED_STATUS 2
>  #include <support/test-driver.c>
> diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
> index 253ebf2e15..1528743319 100644
> --- a/sysdeps/unix/sysv/linux/tst-getpid1.c
> +++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
> @@ -1,4 +1,6 @@
>  #include <sched.h>
> +#include <clone3.h>
> +#include <stdbool.h>
>  #include <signal.h>
>  #include <string.h>
>  #include <stdio.h>
> @@ -27,7 +29,7 @@ f (void *a)
>  
>  
>  static int
> -do_test (void)
> +do_test_clone (bool use_clone_internal)
>  {
>    int mypid = getpid ();
>  
> @@ -42,21 +44,43 @@ do_test (void)
>      }
>  
>  #ifdef __ia64__
> -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> -		       size_t __child_stack_size, int __flags,
> -		       void *__arg, ...);
> -  char st[256 * 1024] __attribute__ ((aligned));
> -  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
> +# define STACK_SIZE 256 * 1024
> +#else
> +# define STACK_SIZE 128 * 1024
> +#endif
> +  char st[STACK_SIZE] __attribute__ ((aligned));
> +
> +  pid_t p;
> +
> +  if (use_clone_internal)
> +    {
> +      struct clone_args clone_args =
> +	{
> +	  .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
> +	  .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
> +	  .stack = (uintptr_t) st,
> +	  .stack_size = sizeof (st),
> +	};
> +      p = __clone_internal (&clone_args, f, 0);

Same as before.

> +    }
> +  else
> +    {
> +#ifdef __ia64__
> +      extern int __clone2 (int (*__fn) (void *__arg),
> +			   void *__child_stack_base,
> +			   size_t __child_stack_size, int __flags,
> +			   void *__arg, ...);
> +      p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
>  #else
> -  char st[128 * 1024] __attribute__ ((aligned));
>  # if _STACK_GROWS_DOWN
> -  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
> +      p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
>  # elif _STACK_GROWS_UP
> -  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
> +      p = clone (f, st, TEST_CLONE_FLAGS, 0);
>  # else
>  #  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
>  # endif
>  #endif
> +    }
>    if (p == -1)
>      {
>        printf("clone failed: %m\n");
> @@ -118,5 +142,17 @@ do_test (void)
>    return 0;
>  }
>  
> +static int
> +do_test (void)
> +{
> +  if (do_test_clone (false) != 0)
> +    return 1;
> +
> +  if (do_test_clone (true) != 0)
> +    return 1;
> +
> +  return 0;
> +}
> +
>  #define TEST_FUNCTION do_test ()
>  #include "../test-skeleton.c"
> diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> new file mode 100644
> index 0000000000..8a7a9a7478
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> @@ -0,0 +1,92 @@
> +/* The clone3 syscall wrapper.  Linux/x86-64 version.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* clone3() is even more special than fork() as it mucks with stacks
> +   and invokes a function in the right context after its all over.  */
> +
> +#include <sysdep.h>
> +#include <clone-offsets.h>
> +
> +/* The userland implementation is:
> +   int clone3 (struct clone_args *cl_args, int (*fn)(void *arg),
> +	       void *arg);
> +   the kernel entry is:
> +   int clone3 (struct clone_args *cl_args, size_t size);
> +
> +   The parameters are passed in registers from userland:
> +   rdi: cl_args
> +   rsi: fn
> +   rdx: arg
> +
> +   The kernel expects:
> +   rax: system call number
> +   rdi: cl_args
> +   rsi: size  */
> +
> +        .text
> +ENTRY (__clone3)
> +	/* Sanity check arguments.  */
> +	movq	$-EINVAL, %rax
> +	testq	%rdi, %rdi		/* No NULL cl_args pointer.  */
> +	jz	SYSCALL_ERROR_LABEL
> +	testq	%rsi, %rsi		/* No NULL function pointer.  */
> +	jz	SYSCALL_ERROR_LABEL
> +
> +	/* Save the function pointer in R8 which is preserved by the
> +	   syscall.  */
> +	movq	%rsi, %r8
> +
> +	/* Put sizeof (struct clone_args) in ESI.  */
> +	movl	$CLONE_ARGS_SIZE , %esi
> +
> +	/* Do the system call.  */
> +	movl	$SYS_ify(clone3), %eax
> +
> +	/* End FDE now, because in the child the unwind info will be
> +	   wrong.  */
> +	cfi_endproc
> +	syscall
> +
> +	test	%RAX_LP, %RAX_LP
> +	jl	SYSCALL_ERROR_LABEL
> +	jz	L(thread_start)
> +
> +	ret
> +
> +L(thread_start):
> +	cfi_startproc
> +	/* Clearing frame pointer is insufficient, use CFI.  */
> +	cfi_undefined (rip)
> +	/* Clear the frame pointer.  The ABI suggests this be done, to mark
> +	   the outermost frame obviously.  */
> +	xorl	%ebp, %ebp
> +
> +	/* Set up arguments for the function call.  */
> +	movq	%rdx, %rdi	/* Argument.  */
> +	call	*%r8		/* Call function.  */
> +	/* Call exit with return value from function call. */
> +	movq	%rax, %rdi
> +	movl	$SYS_ify(exit), %eax
> +	syscall
> +	cfi_endproc
> +
> +	cfi_startproc
> +PSEUDO_END (__clone3)
> +
> +libc_hidden_def (__clone3)
> +weak_alias (__clone3, clone3)
>
H.J. Lu May 15, 2021, 12:24 p.m. UTC | #2
On Thu, May 13, 2021 at 7:51 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 10/05/2021 22:43, H.J. Lu via Libc-alpha wrote:
> > The clone3 system call provides a superset of the functionality of clone
> > and clone2.  It also provides a number of API improve ments, including
> > the ability to specify the size of the child's stack area which can be
> > used by kernel to compute the shadow stack size when allocating the
> > shadow stack.  Add:
> >
> > extern int __clone_internal (struct clone_args *cl_args,
> >                            int (*__fn) (void *__arg), void *__arg);
> >
> > to provide an abstract interface for clone, clone2 and clone3.
> >
> > 1. Add cast_to_pointer to cast an integer to void * pointer.
> > 2. Simplify stack management for thread creation by passing both stack
> > base and size to create_thread.
> > 3. Consolidate clone vs clone2 differences into a single file.
> > 4. Use only __clone_internal to clone a thread.  If clone3 returns -1
> > with ENOSYS, __clone_internal will fall back to clone or clone2.
> > 5. Add the x86-64 clone3 wrapper.
> > 6. Enable the public clone3 wrapper in the future after it has been
> > added to all targets.
> >
>
> This patch is doing both a refactoring to internal clone usage and
> adding the clone3.  I think it would be better to split two patches:
> changing the internal clone calls to use the clone_internal and then
> add clone3 support along with x86_64 support.

Fixed.

> There are some comments below.
>
> > Tested with build-many-glibcs.py.  The x86-64 clone3 wrapper and the
> > i686 clone fallback are tested under 5.12.2 kernel.
> > ---
> >  include/clone3.h                          | 14 ++++
> >  include/libc-pointer-arith.h              |  3 +
> >  sysdeps/unix/sysv/linux/Makefile          |  4 +-
> >  sysdeps/unix/sysv/linux/Versions          |  1 +
> >  sysdeps/unix/sysv/linux/clone-internal.c  | 80 ++++++++++++++++++++
> >  sysdeps/unix/sysv/linux/clone-offsets.sym |  5 ++
> >  sysdeps/unix/sysv/linux/clone3.c          | 34 +++++++++
> >  sysdeps/unix/sysv/linux/clone3.h          | 58 ++++++++++++++
> >  sysdeps/unix/sysv/linux/createthread.c    | 25 +++---
> >  sysdeps/unix/sysv/linux/spawni.c          | 26 +++----
> >  sysdeps/unix/sysv/linux/tst-align-clone.c | 55 +++++++++++---
> >  sysdeps/unix/sysv/linux/tst-clone.c       | 23 +++++-
> >  sysdeps/unix/sysv/linux/tst-clone2.c      | 61 +++++++++++----
> >  sysdeps/unix/sysv/linux/tst-clone3.c      | 60 +++++++++++----
> >  sysdeps/unix/sysv/linux/tst-getpid1.c     | 54 ++++++++++---
> >  sysdeps/unix/sysv/linux/x86_64/clone3.S   | 92 +++++++++++++++++++++++
> >  16 files changed, 517 insertions(+), 78 deletions(-)
> >  create mode 100644 include/clone3.h
> >  create mode 100644 sysdeps/unix/sysv/linux/clone-internal.c
> >  create mode 100644 sysdeps/unix/sysv/linux/clone-offsets.sym
> >  create mode 100644 sysdeps/unix/sysv/linux/clone3.c
> >  create mode 100644 sysdeps/unix/sysv/linux/clone3.h
> >  create mode 100644 sysdeps/unix/sysv/linux/x86_64/clone3.S
> >
> > diff --git a/include/clone3.h b/include/clone3.h
> > new file mode 100644
> > index 0000000000..124f7ba169
> > --- /dev/null
> > +++ b/include/clone3.h
> > @@ -0,0 +1,14 @@
> > +#ifndef _CLONE3_H
> > +#include_next <clone3.h>
> > +
> > +extern __typeof (clone3) __clone3;
> > +
> > +/* The internal wrapper of clone and clone3.  */
> > +extern __typeof (clone3) __clone_internal;
> > +
> > +#ifndef _ISOMAC
> > +libc_hidden_proto (__clone3)
> > +libc_hidden_proto (__clone_internal)
> > +#endif
> > +
> > +#endif
>
> I would use clone_internal.h here (clone3.h is currently .

Fixed.

> > diff --git a/include/libc-pointer-arith.h b/include/libc-pointer-arith.h
> > index 72e722c5aa..04ba537617 100644
> > --- a/include/libc-pointer-arith.h
> > +++ b/include/libc-pointer-arith.h
> > @@ -37,6 +37,9 @@
> >  /* Cast an integer or a pointer VAL to integer with proper type.  */
> >  # define cast_to_integer(val) ((__integer_if_pointer_type (val)) (val))
> >
> > +/* Cast an integer VAL to void * pointer.  */
> > +# define cast_to_pointer(val) ((void *) (uintptr_t) (val))
> > +
> >  /* Align a value by rounding down to closest size.
> >     e.g. Using size of 4096, we get this behavior:
> >       {4095, 4096, 4097} = {0, 4096, 4096}.  */
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> > index fb155cf856..7c1f32b84d 100644
> > --- a/sysdeps/unix/sysv/linux/Makefile
> > +++ b/sysdeps/unix/sysv/linux/Makefile
> > @@ -54,6 +54,8 @@ CFLAGS-malloc.c += -DMORECORE_CLEARS=2
> >  endif
> >
> >  ifeq ($(subdir),misc)
> > +gen-as-const-headers += clone-offsets.sym
> > +
> >  sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
> >                  setfsuid setfsgid epoll_pwait signalfd \
> >                  eventfd eventfd_read eventfd_write prlimit \
> > @@ -64,7 +66,7 @@ sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
> >                  time64-support pselect32 \
> >                  xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
> >                  fxstatat fxstatat64 \
> > -                xmknod xmknodat
> > +                xmknod xmknodat clone3 clone-internal
> >
> >  CFLAGS-gethostid.c = -fexceptions
> >  CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
> > index 220bb2dffe..299d4fef9c 100644
> > --- a/sysdeps/unix/sysv/linux/Versions
> > +++ b/sysdeps/unix/sysv/linux/Versions
> > @@ -179,6 +179,7 @@ libc {
> >      __sigtimedwait;
> >      # functions used by nscd
> >      __netlink_assert_response;
> > +    __clone_internal;
> >    }
> >  }
> >
>
> Ok, it is used by pthread_create.
>
> > diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
> > new file mode 100644
> > index 0000000000..acafb78d81
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/clone-internal.c
> > @@ -0,0 +1,80 @@
> > +/* The internal wrapper of clone and clone3.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <errno.h>
> > +#include <sched.h>
> > +#include <clone3.h>
> > +#include <libc-pointer-arith.h>      /* For cast_to_pointer.  */
> > +#include <stackinfo.h>               /* For _STACK_GROWS_{UP,DOWN}.  */
> > +
> > +int
> > +__clone_internal (struct clone_args *cl_args,
> > +               int (*fn) (void *arg), void *arg)
> > +{
> > +  int ret;
> > +#ifdef CLONE_ARGS_SIZE_VER0
>
> I think we should issue clone3 regardless of CLONE_ARGS_SIZE_VER0 is
> defined (as we do for all other syscalls).  We could use the define to
> add similar checks the kernel does, to assure our defined struct is
> similar to the one expected by the kernel:
>
> | #define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
> | #define offsetofend(TYPE, MEMBER) \
> |         (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
> |
> | #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
> | #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
> | #define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
> |
> | _Static_assert (offsetofend(struct clone_args, tls) == CLONE_ARGS_SIZE_VER0,
> |                 "offsetofend(struct clone_args, tls) != CLONE_ARGS_SIZE_VER0");
> | _Static_assert (offsetofend(struct clone_args, set_tid_size) == CLONE_ARGS_SIZE_VER1,
> |                 "offsetofend(struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1");
> | _Static_assert (offsetofend(struct clone_args, cgroup) == CLONE_ARGS_SIZE_VER0,
> |                 "offsetofend(struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER0");
> | _Static_assert (sizeof(struct clone_args) == CLONE_ARGS_SIZE_VER2,
> |                 "sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2");

Fixed.

> > +  /* Try clone3 first.  */
> > +  int saved_errno = errno;
> > +  ret = __clone3 (cl_args, fn, arg);
> > +  if (ret != -1 || errno != ENOSYS)
> > +    return ret;
>
> It will causa an extra function call for all architectures that do not
> proper implement clone3.  Maybe a per-architecture define (HAVE_CLONE3_IMPL
> define at sysdep.h) to indicate it implements clone3 (since I assume we
> won't add a clone3.S for *all* archictures at same time and it might take
> some time until we have support for all architectures):

Fixed.

> > +
> > +  /* NB: Restore errno since errno may be checked against non-zero
> > +     return value.  */
> > +  __set_errno (saved_errno);
> > +#else
> > +    /* Check invalid arguments.  */
> > +  if (cl_args == NULL || fn == NULL)
> > +    __set_errno (EINVAL);
> > +#endif
> > +
> > +  /* Check invalid clone3 specific bits.  */
> > +  if ((cl_args->flags & CSIGNAL) != 0
> > +      || (cl_args->flags & __UINT64_C (0xffffffff00000000)))
> > +    {
> > +      __set_errno (EINVAL);
> > +      return -1;
> > +    }
> > +
>
> Should we mimic the kernel check here? 'clone3' will already return EINVAL
> in such case and 'clone' will clear the higher bits.

Fixed.

> > +  /* Map clone3 arguments to clone arguments.  */
> > +  int flags = cl_args->flags | cl_args->exit_signal;
> > +  void *stack = cast_to_pointer (cl_args->stack);
> > +
> > +#ifdef __ia64__
> > +  ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
>
> I think there is no need of this cast (the __clone2 has a proper prototype).

Fixed.

> > +               flags, arg,
> > +               cast_to_pointer (cl_args->parent_tid),
> > +               cast_to_pointer (cl_args->tls),
> > +               cast_to_pointer (cl_args->child_tid));
> > +#else
> > +# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
> > +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> > +# endif
> > +
> > +# if _STACK_GROWS_DOWN
> > +  stack += cl_args->stack_size;
> > +# endif
> > +  ret = __clone (fn, stack, flags, arg,
> > +              cast_to_pointer (cl_args->parent_tid),
> > +              cast_to_pointer (cl_args->tls),
> > +              cast_to_pointer (cl_args->child_tid));
> > +#endif
> > +  return ret;
> > +}
> > +
> > +libc_hidden_def (__clone_internal)
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
> > new file mode 100644
> > index 0000000000..d767e49fc8
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
> > @@ -0,0 +1,5 @@
> > +#include <clone3.h>
> > +
> > +--
> > +
> > +CLONE_ARGS_SIZE                      sizeof (struct clone_args)
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
> > new file mode 100644
> > index 0000000000..b7a41f82e8
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/clone3.c
> > @@ -0,0 +1,34 @@
> > +/* The clone3 syscall wrapper.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <errno.h>
> > +#include <stddef.h>
> > +#include <clone3.h>
> > +
> > +int
> > +__clone3 (struct clone_args *cl_args, int (*fn) (void *arg), void *arg)
> > +{
> > +  if (cl_args == NULL || fn == NULL)
> > +    __set_errno (EINVAL);
> > +  else
> > +    __set_errno (ENOSYS);
> > +  return -1;
> > +}
> > +
> > +libc_hidden_def (__clone3)
> > +weak_alias (__clone3, clone3)
>
> Ok, although I think se we are aiming to not export yet I would prefer to
> conditionalize clone3 usage for architectures that actually implements
> the clone3.S by using a per-architecture define.
>
> I am not sure it would be valuable to export clone3 while providing a
> proper implementation only for a handful of architectures.

I changed clone3.c to a single line:

/* An empty placeholder.  */

> > diff --git a/sysdeps/unix/sysv/linux/clone3.h b/sysdeps/unix/sysv/linux/clone3.h
> > new file mode 100644
> > index 0000000000..be8a11a17c
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/clone3.h
> > @@ -0,0 +1,58 @@
> > +/* The wrapper of clone3.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library.  If not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef _CLONE3_H
> > +#define _CLONE3_H    1
> > +
> > +#include <features.h>
> > +#include <stdint.h>
> > +#define clone_args __redirect_clone_args
> > +#include <linux/sched.h>
> > +#undef clone_args
> I think we include it mainly for the CLONE_ARGS_SIZE_VER0 and the define
> is used solely to conditionaly build the clone3 call.  I am not very found
> adding it such requirement, specially on a header that might be aimed to
> be installed (I take it from __BEGIN_DECLS below).
>
> I dont't think we need to conditionalize the clone3 call to the
> CLONE_ARGS_SIZE_VER0, specially because the clone_args structure below
> seems to cover not only VER0, but also VER1 and VER2.

Fixed.

> > +
> > +__BEGIN_DECLS
> > +
> > +struct clone_args
> > +{
> > +  uint64_t flags;     /* Flags bit mask.  */
>
> The kernel uses __aligned_u64, which is define as:
>
> include/uapi/linux/types.h
>  48 #define __aligned_u64 __u64 __attribute__((aligned(8)))
>
> Although it works on most architectures, uint64_t does not have
> alignment of 8 for every supported architecture.  Trying to compiler
> the small snippet below:
>
> | #include <stdint.h>
> | _Static_assert (_Alignof (uint64_t) == 8, "_Alignof (uint64_t) != 8");
>
> It fails for i686, m68k, microblaze{el}, nios2, and sh4. So I think we
> will need to use '__attribute__((aligned(8)))' on our internal definition
> as well.

I added __attribute__((aligned(8))) to struct clone_args.  Since each
field is 8-byte aligned, it is equivalent to align each field to 8 bytes.
I added:

_Static_assert (__alignof(struct clone_args) == 8,
"__alignof(struct clone_args) != 8");

to verify it.

> > +  uint64_t pidfd;     /* Where to store PID file descriptor
> > +                         (pid_t *).  */
> > +  uint64_t child_tid;         /* Where to store child TID, in child's memory
> > +                         (pid_t *).  */
> > +  uint64_t parent_tid;        /* Where to store child TID, in parent's memory
> > +                         (int *). */
> > +  uint64_t exit_signal;       /* Signal to deliver to parent on child
> > +                         termination */
> > +  uint64_t stack;     /* The lowest address of stack.  */
> > +  uint64_t stack_size;        /* Size of stack.  */
> > +  uint64_t tls;               /* Location of new TLS.  */
> > +  uint64_t set_tid;   /* Pointer to a pid_t array
> > +                         (since Linux 5.5).  */
> > +  uint64_t set_tid_size; /* Number of elements in set_tid
> > +                         (since Linux 5.5). */
> > +  uint64_t cgroup;    /* File descriptor for target cgroup
> > +                         of child (since Linux 5.7).  */
> > +};
> > +
> > +/* The wrapper of clone3.  */
> > +extern int clone3 (struct clone_args *cl_args,
> > +                int (*__fn) (void *__arg), void *__arg);
> > +
> > +__END_DECLS
> > +
> > +#endif /* clone3.h */
> > diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
> > index bc3409b326..2b5e6cb131 100644
> > --- a/sysdeps/unix/sysv/linux/createthread.c
> > +++ b/sysdeps/unix/sysv/linux/createthread.c
> > @@ -18,6 +18,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #include <sched.h>
> > +#include <clone3.h>
> >  #include <setjmp.h>
> >  #include <signal.h>
> >  #include <stdlib.h>
> > @@ -28,12 +29,6 @@
> >
> >  #include <arch-fork.h>
> >
> > -#ifdef __NR_clone2
> > -# define ARCH_CLONE __clone2
> > -#else
> > -# define ARCH_CLONE __clone
> > -#endif
> > -
> >  /* See the comments in pthread_create.c for the requirements for these
> >     two macros and the create_thread function.  */
> >
>
> Ok.
>
> > @@ -47,7 +42,8 @@ static int start_thread (void *arg) __attribute__ ((noreturn));
> >
> >  static int
> >  create_thread (struct pthread *pd, const struct pthread_attr *attr,
> > -            bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
> > +            bool *stopped_start, void *stackaddr, size_t stacksize,
> > +            bool *thread_ran)
> >  {
> >    /* Determine whether the newly created threads has to be started
> >       stopped since we have to set the scheduling parameters or set the
> > @@ -100,9 +96,18 @@ create_thread (struct pthread *pd, const struct pthread_attr *attr,
> >
> >    TLS_DEFINE_INIT_TP (tp, pd);
> >
> > -  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
> > -                                 clone_flags, pd, &pd->tid, tp, &pd->tid)
> > -                     == -1))
> > +  struct clone_args args =
> > +    {
> > +      .flags = clone_flags,
> > +      .pidfd = (uintptr_t) &pd->tid,
> > +      .parent_tid = (uintptr_t) &pd->tid,
> > +      .child_tid = (uintptr_t) &pd->tid,
> > +      .stack = (uintptr_t) stackaddr,
> > +      .stack_size = stacksize,
> > +      .tls = (uintptr_t) tp,
> > +    };
> > +  int ret = __clone_internal (&args, &start_thread, pd);
> > +  if (__glibc_unlikely (ret == -1))
> >      return errno;
> >
> >    /* It's started now, so if we fail below, we'll have to cancel it
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> > index 501f8fbccd..92b61e55d3 100644
> > --- a/sysdeps/unix/sysv/linux/spawni.c
> > +++ b/sysdeps/unix/sysv/linux/spawni.c
> > @@ -31,6 +31,7 @@
> >  #include <dl-sysdep.h>
> >  #include <libc-pointer-arith.h>
> >  #include <ldsodefs.h>
> > +#include <clone3.h>
> >  #include "spawn_int.h"
> >
> >  /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
> > @@ -59,21 +60,6 @@
> >     normal program exit with the exit code 127.  */
> >  #define SPAWN_ERROR  127
> >
> > -#ifdef __ia64__
> > -# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
> > -  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
> > -#else
> > -# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
> > -  __clone (__fn, __stack, __flags, __args)
> > -#endif
> > -
> > -/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
> > -#if _STACK_GROWS_UP || defined (__ia64__)
> > -# define STACK(__stack, __stack_size) (__stack)
> > -#elif _STACK_GROWS_DOWN
> > -# define STACK(__stack, __stack_size) (__stack + __stack_size)
> > -#endif
> > -
> >
> >  struct posix_spawn_args
> >  {
> > @@ -378,8 +364,14 @@ __spawnix (pid_t * pid, const char *file,
> >       need for CLONE_SETTLS.  Although parent and child share the same TLS
> >       namespace, there will be no concurrent access for TLS variables (errno
> >       for instance).  */
> > -  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
> > -                CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
> > +  struct clone_args clone_args =
> > +    {
> > +      .flags = CLONE_VM | CLONE_VFORK,
> > +      .exit_signal = SIGCHLD,
> > +      .stack = (uintptr_t) stack,
> > +      .stack_size = stack_size,
> > +    };
> > +  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
> >
> >    /* It needs to collect the case where the auxiliary process was created
> >       but failed to execute the file (due either any preparation step or
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
> > index 6ace61bac3..5a463e7af3 100644
> > --- a/sysdeps/unix/sysv/linux/tst-align-clone.c
> > +++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
> > @@ -16,6 +16,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #include <sched.h>
> > +#include <clone3.h>
> >  #include <stdbool.h>
> >  #include <stdint.h>
> >  #include <stdio.h>
> > @@ -39,7 +40,7 @@ f (void *arg)
> >  }
> >
> >  static int
> > -do_test (void)
> > +do_test_clone (bool use_clone_internal)
> >  {
> >    bool ok = true;
> >
> > @@ -49,21 +50,41 @@ do_test (void)
> >      ok = false;
> >
> >  #ifdef __ia64__
> > -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> > -                    size_t __child_stack_size, int __flags,
> > -                    void *__arg, ...);
> > -  char st[256 * 1024];
> > -  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
> > +# define STACK_SIZE 256 * 1024
> > +#else
> > +# define STACK_SIZE 128 * 1024
> > +#endif
> > +  char st[STACK_SIZE] __attribute__ ((aligned));
> > +
> > +  pid_t p;
> > +
> > +  if (use_clone_internal)
> > +    {
> > +      struct clone_args clone_args =
> > +     {
> > +       .stack = (uintptr_t) st,
> > +       .stack_size = sizeof (st),
> > +     };
> > +      p = __clone_internal (&clone_args, f, 0);
>
> It only works because it is linking against to __clone_interal@@GLIBC_PRIVATE,
> which I don't think it a good way to check for internal interfaces.
>
> I would rather add a test-internal to check for __clone_internal usage.

Fixed.

> > +    }
> > +  else
> > +    {
> > +#ifdef __ia64__
> > +      extern int __clone2 (int (*__fn) (void *__arg),
> > +                        void *__child_stack_base,
> > +                        size_t __child_stack_size, int __flags,
> > +                        void *__arg, ...);
> > +      p = __clone2 (f, st, sizeof (st), 0, 0);
> >  #else
> > -  char st[128 * 1024] __attribute__ ((aligned));
> >  # if _STACK_GROWS_DOWN
> > -  pid_t p = clone (f, st + sizeof (st), 0, 0);
> > +      p = clone (f, st + sizeof (st), 0, 0);
> >  # elif _STACK_GROWS_UP
> > -  pid_t p = clone (f, st, 0, 0);
> > +      p = clone (f, st, 0, 0);
> >  # else
> >  #  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> >  # endif
> >  #endif
> > +    }
> >    if (p == -1)
> >      {
> >        printf("clone failed: %m\n");
> > @@ -91,5 +112,21 @@ do_test (void)
> >    return ok ? 0 : 1;
> >  }
> >
> > +static int
> > +do_test (void)
> > +{
> > +  bool ok = true;
> > +
> > +  puts ("in main");
> > +
> > +  if (do_test_clone (false) != 0)
> > +    ok = false;
> > +
> > +  if (do_test_clone (true) != 0)
> > +    ok = false;
> > +
> > +  return ok ? 0 : 1;
> > +}
> > +
> >  #define TEST_FUNCTION do_test ()
> >  #include "../test-skeleton.c"
>
> Ok.
>
> > diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
> > index e6ae0106ef..855f48121e 100644
> > --- a/sysdeps/unix/sysv/linux/tst-clone.c
> > +++ b/sysdeps/unix/sysv/linux/tst-clone.c
> > @@ -22,6 +22,8 @@
> >  #include <stdlib.h>
> >  #include <unistd.h>
> >  #include <sched.h>
> > +#include <clone3.h>
> > +#include <stdbool.h>
> >
> >  #ifdef __ia64__
> >  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> > @@ -35,14 +37,17 @@ int child_fn(void *arg)
> >  }
> >
> >  static int
> > -do_test (void)
> > +do_test_clone (bool use_clone_internal)
> >  {
> >    int result;
> >
> > +  if (use_clone_internal)
> > +    result = __clone_internal (NULL, child_fn, NULL);
>
> Same as before.
>
> > +  else
> >  #ifdef __ia64__
> > -  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
> > +    result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
> >  #else
> > -  result = clone (child_fn, NULL, 0, NULL);
> > +    result = clone (child_fn, NULL, 0, NULL);
> >  #endif
> >
> >    if (errno != EINVAL || result != -1)
> > @@ -52,6 +57,18 @@ do_test (void)
> >        return 1;
> >      }
> >
> > +  return 0;
> > +}
> > +
> > +static int
> > +do_test (void)
> > +{
> > +  if (do_test_clone (false) != 0)
> > +    return 1;
> > +
> > +  if (do_test_clone (true) != 0)
> > +    return 1;
> > +
> >    puts ("All OK");
> >    return 0;
> >  }
> > diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
> > index ce36c70c0d..1df336f8c6 100644
> > --- a/sysdeps/unix/sysv/linux/tst-clone2.c
> > +++ b/sysdeps/unix/sysv/linux/tst-clone2.c
> > @@ -17,6 +17,7 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #include <sched.h>
> > +#include <clone3.h>
> >  #include <signal.h>
> >  #include <string.h>
> >  #include <stdio.h>
> > @@ -58,7 +59,7 @@ f (void *a)
> >
> >
> >  static int
> > -do_test (void)
> > +do_test_clone (bool use_clone_internal)
> >  {
> >    sig = SIGRTMIN;
> >    sigset_t ss;
> > @@ -70,23 +71,43 @@ do_test (void)
> >    if (pipe2 (pipefd, O_CLOEXEC))
> >      FAIL_EXIT1 ("pipe failed: %m");
> >
> > -  int clone_flags = 0;
> >  #ifdef __ia64__
> > -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> > -                    size_t __child_stack_size, int __flags,
> > -                    void *__arg, ...);
> > -  char st[256 * 1024] __attribute__ ((aligned));
> > -  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
> > -#else
> > -  char st[128 * 1024] __attribute__ ((aligned));
> > -#if _STACK_GROWS_DOWN
> > -  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
> > -#elif _STACK_GROWS_UP
> > -  pid_t p = clone (f, st, clone_flags, 0);
> > +# define STACK_SIZE 256 * 1024
> >  #else
> > -#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> > +# define STACK_SIZE 128 * 1024
> >  #endif
> > +  char st[STACK_SIZE] __attribute__ ((aligned));
> > +
> > +  pid_t p;
> > +
> > +  if (use_clone_internal)
> > +    {
> > +      struct clone_args clone_args =
> > +     {
> > +       .stack = (uintptr_t) st,
> > +       .stack_size = sizeof (st),
> > +     };
> > +      p = __clone_internal (&clone_args, f, 0);
>
> Same as before.
>
> > +    }
> > +  else
> > +    {
> > +      int clone_flags = 0;
> > +#ifdef __ia64__
> > +      extern int __clone2 (int (*__fn) (void *__arg),
> > +                        void *__child_stack_base,
> > +                        size_t __child_stack_size, int __flags,
> > +                        void *__arg, ...);
> > +      p = __clone2 (f, st, sizeof (st), clone_flags, 0);
> > +#else
> > +# if _STACK_GROWS_DOWN
> > +      p = clone (f, st + sizeof (st), clone_flags, 0);
> > +# elif _STACK_GROWS_UP
> > +      p = clone (f, st, clone_flags, 0);
> > +# else
> > +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> > +# endif
> >  #endif
> > +    }
> >
> >    close (pipefd[1]);
> >
> > @@ -143,4 +164,16 @@ do_test (void)
> >    return ret;
> >  }
> >
> > +static int
> > +do_test (void)
> > +{
> > +  if (do_test_clone (false) != 0)
> > +    return 1;
> > +
> > +  if (do_test_clone (true) != 0)
> > +    return 1;
> > +
> > +  return 0;
> > +}
> > +
> >  #include <support/test-driver.c>
> > diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
> > index 1a1bfd4586..f4eabfc5a7 100644
> > --- a/sysdeps/unix/sysv/linux/tst-clone3.c
> > +++ b/sysdeps/unix/sysv/linux/tst-clone3.c
> > @@ -18,6 +18,8 @@
> >
> >  #include <string.h>
> >  #include <sched.h>
> > +#include <clone3.h>
> > +#include <stdbool.h>
> >  #include <signal.h>
> >  #include <unistd.h>
> >  #include <sys/syscall.h>
> > @@ -64,7 +66,7 @@ futex_wait (int *futexp, int val)
> >  }
> >
> >  static int
> > -do_test (void)
> > +do_test_clone (bool use_clone_internal)
> >  {
> >    char st[1024] __attribute__ ((aligned));
> >    int clone_flags = CLONE_THREAD;
> > @@ -78,23 +80,39 @@ do_test (void)
> >    pid_t ctid = CTID_INIT_VAL;
> >    pid_t tid;
> >
> > +  if (use_clone_internal)
> > +    {
> > +      struct clone_args clone_args =
> > +     {
> > +       .flags = clone_flags & ~CSIGNAL,
> > +       .exit_signal = clone_flags & CSIGNAL,
> > +       .stack = (uintptr_t) st,
> > +       .stack_size = sizeof (st),
> > +       .child_tid = (uintptr_t) &ctid,
> > +     };
> > +      tid = __clone_internal (&clone_args, f, NULL);
>
> Same as before.
>
> > +    }
> > +  else
> > +    {
> >  #ifdef __ia64__
> > -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> > -                    size_t __child_stack_size, int __flags,
> > -                    void *__arg, ...);
> > -  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> > -               /* tls */ NULL, &ctid);
> > -#else
> > -#if _STACK_GROWS_DOWN
> > -  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> > -            /* tls */ NULL, &ctid);
> > -#elif _STACK_GROWS_UP
> > -  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
> > -            &ctid);
> > +      extern int __clone2 (int (*__fn) (void *__arg),
> > +                        void *__child_stack_base,
> > +                        size_t __child_stack_size, int __flags,
> > +                        void *__arg, ...);
> > +      tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> > +                   /* tls */ NULL, &ctid);
> >  #else
> > -#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> > -#endif
> > +# if _STACK_GROWS_DOWN
> > +      tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
> > +                /* tls */ NULL, &ctid);
> > +# elif _STACK_GROWS_UP
> > +      tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
> > +                &ctid);
> > +# else
> > +#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> > +# endif
> >  #endif
> > +    }
> >    if (tid == -1)
> >      FAIL_EXIT1 ("clone failed: %m");
> >
> > @@ -103,5 +121,17 @@ do_test (void)
> >    return 2;
> >  }
> >
> > +static int
> > +do_test (void)
> > +{
> > +  if (do_test_clone (false) != 2)
> > +    return 1;
> > +
> > +  if (do_test_clone (true) != 2)
> > +    return 1;
> > +
> > +  return 2;
> > +}
> > +
> >  #define EXPECTED_STATUS 2
> >  #include <support/test-driver.c>
> > diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
> > index 253ebf2e15..1528743319 100644
> > --- a/sysdeps/unix/sysv/linux/tst-getpid1.c
> > +++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
> > @@ -1,4 +1,6 @@
> >  #include <sched.h>
> > +#include <clone3.h>
> > +#include <stdbool.h>
> >  #include <signal.h>
> >  #include <string.h>
> >  #include <stdio.h>
> > @@ -27,7 +29,7 @@ f (void *a)
> >
> >
> >  static int
> > -do_test (void)
> > +do_test_clone (bool use_clone_internal)
> >  {
> >    int mypid = getpid ();
> >
> > @@ -42,21 +44,43 @@ do_test (void)
> >      }
> >
> >  #ifdef __ia64__
> > -  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
> > -                    size_t __child_stack_size, int __flags,
> > -                    void *__arg, ...);
> > -  char st[256 * 1024] __attribute__ ((aligned));
> > -  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
> > +# define STACK_SIZE 256 * 1024
> > +#else
> > +# define STACK_SIZE 128 * 1024
> > +#endif
> > +  char st[STACK_SIZE] __attribute__ ((aligned));
> > +
> > +  pid_t p;
> > +
> > +  if (use_clone_internal)
> > +    {
> > +      struct clone_args clone_args =
> > +     {
> > +       .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
> > +       .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
> > +       .stack = (uintptr_t) st,
> > +       .stack_size = sizeof (st),
> > +     };
> > +      p = __clone_internal (&clone_args, f, 0);
>
> Same as before.
>
> > +    }
> > +  else
> > +    {
> > +#ifdef __ia64__
> > +      extern int __clone2 (int (*__fn) (void *__arg),
> > +                        void *__child_stack_base,
> > +                        size_t __child_stack_size, int __flags,
> > +                        void *__arg, ...);
> > +      p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
> >  #else
> > -  char st[128 * 1024] __attribute__ ((aligned));
> >  # if _STACK_GROWS_DOWN
> > -  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
> > +      p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
> >  # elif _STACK_GROWS_UP
> > -  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
> > +      p = clone (f, st, TEST_CLONE_FLAGS, 0);
> >  # else
> >  #  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
> >  # endif
> >  #endif
> > +    }
> >    if (p == -1)
> >      {
> >        printf("clone failed: %m\n");
> > @@ -118,5 +142,17 @@ do_test (void)
> >    return 0;
> >  }
> >
> > +static int
> > +do_test (void)
> > +{
> > +  if (do_test_clone (false) != 0)
> > +    return 1;
> > +
> > +  if (do_test_clone (true) != 0)
> > +    return 1;
> > +
> > +  return 0;
> > +}
> > +
> >  #define TEST_FUNCTION do_test ()
> >  #include "../test-skeleton.c"
> > diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> > new file mode 100644
> > index 0000000000..8a7a9a7478
> > --- /dev/null
> > +++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
> > @@ -0,0 +1,92 @@
> > +/* The clone3 syscall wrapper.  Linux/x86-64 version.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* clone3() is even more special than fork() as it mucks with stacks
> > +   and invokes a function in the right context after its all over.  */
> > +
> > +#include <sysdep.h>
> > +#include <clone-offsets.h>
> > +
> > +/* The userland implementation is:
> > +   int clone3 (struct clone_args *cl_args, int (*fn)(void *arg),
> > +            void *arg);
> > +   the kernel entry is:
> > +   int clone3 (struct clone_args *cl_args, size_t size);
> > +
> > +   The parameters are passed in registers from userland:
> > +   rdi: cl_args
> > +   rsi: fn
> > +   rdx: arg
> > +
> > +   The kernel expects:
> > +   rax: system call number
> > +   rdi: cl_args
> > +   rsi: size  */
> > +
> > +        .text
> > +ENTRY (__clone3)
> > +     /* Sanity check arguments.  */
> > +     movq    $-EINVAL, %rax
> > +     testq   %rdi, %rdi              /* No NULL cl_args pointer.  */
> > +     jz      SYSCALL_ERROR_LABEL
> > +     testq   %rsi, %rsi              /* No NULL function pointer.  */
> > +     jz      SYSCALL_ERROR_LABEL
> > +
> > +     /* Save the function pointer in R8 which is preserved by the
> > +        syscall.  */
> > +     movq    %rsi, %r8
> > +
> > +     /* Put sizeof (struct clone_args) in ESI.  */
> > +     movl    $CLONE_ARGS_SIZE , %esi
> > +
> > +     /* Do the system call.  */
> > +     movl    $SYS_ify(clone3), %eax
> > +
> > +     /* End FDE now, because in the child the unwind info will be
> > +        wrong.  */
> > +     cfi_endproc
> > +     syscall
> > +
> > +     test    %RAX_LP, %RAX_LP
> > +     jl      SYSCALL_ERROR_LABEL
> > +     jz      L(thread_start)
> > +
> > +     ret
> > +
> > +L(thread_start):
> > +     cfi_startproc
> > +     /* Clearing frame pointer is insufficient, use CFI.  */
> > +     cfi_undefined (rip)
> > +     /* Clear the frame pointer.  The ABI suggests this be done, to mark
> > +        the outermost frame obviously.  */
> > +     xorl    %ebp, %ebp
> > +
> > +     /* Set up arguments for the function call.  */
> > +     movq    %rdx, %rdi      /* Argument.  */
> > +     call    *%r8            /* Call function.  */
> > +     /* Call exit with return value from function call. */
> > +     movq    %rax, %rdi
> > +     movl    $SYS_ify(exit), %eax
> > +     syscall
> > +     cfi_endproc
> > +
> > +     cfi_startproc
> > +PSEUDO_END (__clone3)
> > +
> > +libc_hidden_def (__clone3)
> > +weak_alias (__clone3, clone3)
> >

Thanks.
diff mbox series

Patch

diff --git a/include/clone3.h b/include/clone3.h
new file mode 100644
index 0000000000..124f7ba169
--- /dev/null
+++ b/include/clone3.h
@@ -0,0 +1,14 @@ 
+#ifndef _CLONE3_H
+#include_next <clone3.h>
+
+extern __typeof (clone3) __clone3;
+
+/* The internal wrapper of clone and clone3.  */
+extern __typeof (clone3) __clone_internal;
+
+#ifndef _ISOMAC
+libc_hidden_proto (__clone3)
+libc_hidden_proto (__clone_internal)
+#endif
+
+#endif
diff --git a/include/libc-pointer-arith.h b/include/libc-pointer-arith.h
index 72e722c5aa..04ba537617 100644
--- a/include/libc-pointer-arith.h
+++ b/include/libc-pointer-arith.h
@@ -37,6 +37,9 @@ 
 /* Cast an integer or a pointer VAL to integer with proper type.  */
 # define cast_to_integer(val) ((__integer_if_pointer_type (val)) (val))
 
+/* Cast an integer VAL to void * pointer.  */
+# define cast_to_pointer(val) ((void *) (uintptr_t) (val))
+
 /* Align a value by rounding down to closest size.
    e.g. Using size of 4096, we get this behavior:
 	{4095, 4096, 4097} = {0, 4096, 4096}.  */
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index fb155cf856..7c1f32b84d 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -54,6 +54,8 @@  CFLAGS-malloc.c += -DMORECORE_CLEARS=2
 endif
 
 ifeq ($(subdir),misc)
+gen-as-const-headers += clone-offsets.sym
+
 sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   setfsuid setfsgid epoll_pwait signalfd \
 		   eventfd eventfd_read eventfd_write prlimit \
@@ -64,7 +66,7 @@  sysdep_routines += adjtimex clone umount umount2 readahead sysctl \
 		   time64-support pselect32 \
 		   xstat fxstat lxstat xstat64 fxstat64 lxstat64 \
 		   fxstatat fxstatat64 \
-		   xmknod xmknodat
+		   xmknod xmknodat clone3 clone-internal
 
 CFLAGS-gethostid.c = -fexceptions
 CFLAGS-tee.c = -fexceptions -fasynchronous-unwind-tables
diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
index 220bb2dffe..299d4fef9c 100644
--- a/sysdeps/unix/sysv/linux/Versions
+++ b/sysdeps/unix/sysv/linux/Versions
@@ -179,6 +179,7 @@  libc {
     __sigtimedwait;
     # functions used by nscd
     __netlink_assert_response;
+    __clone_internal;
   }
 }
 
diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c
new file mode 100644
index 0000000000..acafb78d81
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-internal.c
@@ -0,0 +1,80 @@ 
+/* The internal wrapper of clone and clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <sched.h>
+#include <clone3.h>
+#include <libc-pointer-arith.h>	/* For cast_to_pointer.  */
+#include <stackinfo.h>		/* For _STACK_GROWS_{UP,DOWN}.  */
+
+int
+__clone_internal (struct clone_args *cl_args,
+		  int (*fn) (void *arg), void *arg)
+{
+  int ret;
+#ifdef CLONE_ARGS_SIZE_VER0
+  /* Try clone3 first.  */
+  int saved_errno = errno;
+  ret = __clone3 (cl_args, fn, arg);
+  if (ret != -1 || errno != ENOSYS)
+    return ret;
+
+  /* NB: Restore errno since errno may be checked against non-zero
+     return value.  */
+  __set_errno (saved_errno);
+#else
+    /* Check invalid arguments.  */
+  if (cl_args == NULL || fn == NULL)
+    __set_errno (EINVAL);
+#endif
+
+  /* Check invalid clone3 specific bits.  */
+  if ((cl_args->flags & CSIGNAL) != 0
+      || (cl_args->flags & __UINT64_C (0xffffffff00000000)))
+    {
+      __set_errno (EINVAL);
+      return -1;
+    }
+
+  /* Map clone3 arguments to clone arguments.  */
+  int flags = cl_args->flags | cl_args->exit_signal;
+  void *stack = cast_to_pointer (cl_args->stack);
+
+#ifdef __ia64__
+  ret = __clone2 (fn, stack, (size_t) cl_args->stack_size,
+		  flags, arg,
+		  cast_to_pointer (cl_args->parent_tid),
+		  cast_to_pointer (cl_args->tls),
+		  cast_to_pointer (cl_args->child_tid));
+#else
+# if !_STACK_GROWS_DOWN && !_STACK_GROWS_UP
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
+
+# if _STACK_GROWS_DOWN
+  stack += cl_args->stack_size;
+# endif
+  ret = __clone (fn, stack, flags, arg,
+		 cast_to_pointer (cl_args->parent_tid),
+		 cast_to_pointer (cl_args->tls),
+		 cast_to_pointer (cl_args->child_tid));
+#endif
+  return ret;
+}
+
+libc_hidden_def (__clone_internal)
diff --git a/sysdeps/unix/sysv/linux/clone-offsets.sym b/sysdeps/unix/sysv/linux/clone-offsets.sym
new file mode 100644
index 0000000000..d767e49fc8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone-offsets.sym
@@ -0,0 +1,5 @@ 
+#include <clone3.h>
+
+--
+
+CLONE_ARGS_SIZE			sizeof (struct clone_args)
diff --git a/sysdeps/unix/sysv/linux/clone3.c b/sysdeps/unix/sysv/linux/clone3.c
new file mode 100644
index 0000000000..b7a41f82e8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.c
@@ -0,0 +1,34 @@ 
+/* The clone3 syscall wrapper.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <stddef.h>
+#include <clone3.h>
+
+int
+__clone3 (struct clone_args *cl_args, int (*fn) (void *arg), void *arg)
+{
+  if (cl_args == NULL || fn == NULL)
+    __set_errno (EINVAL);
+  else
+    __set_errno (ENOSYS);
+  return -1;
+}
+
+libc_hidden_def (__clone3)
+weak_alias (__clone3, clone3)
diff --git a/sysdeps/unix/sysv/linux/clone3.h b/sysdeps/unix/sysv/linux/clone3.h
new file mode 100644
index 0000000000..be8a11a17c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/clone3.h
@@ -0,0 +1,58 @@ 
+/* The wrapper of clone3.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _CLONE3_H
+#define _CLONE3_H	1
+
+#include <features.h>
+#include <stdint.h>
+#define clone_args __redirect_clone_args
+#include <linux/sched.h>
+#undef clone_args
+
+__BEGIN_DECLS
+
+struct clone_args
+{
+  uint64_t flags;	 /* Flags bit mask.  */
+  uint64_t pidfd;	 /* Where to store PID file descriptor
+			    (pid_t *).  */
+  uint64_t child_tid;	 /* Where to store child TID, in child's memory
+			    (pid_t *).  */
+  uint64_t parent_tid;	 /* Where to store child TID, in parent's memory
+			    (int *). */
+  uint64_t exit_signal;	 /* Signal to deliver to parent on child
+			    termination */
+  uint64_t stack;	 /* The lowest address of stack.  */
+  uint64_t stack_size;	 /* Size of stack.  */
+  uint64_t tls;		 /* Location of new TLS.  */
+  uint64_t set_tid;	 /* Pointer to a pid_t array
+			    (since Linux 5.5).  */
+  uint64_t set_tid_size; /* Number of elements in set_tid
+			    (since Linux 5.5). */
+  uint64_t cgroup;	 /* File descriptor for target cgroup
+			    of child (since Linux 5.7).  */
+};
+
+/* The wrapper of clone3.  */
+extern int clone3 (struct clone_args *cl_args,
+		   int (*__fn) (void *__arg), void *__arg);
+
+__END_DECLS
+
+#endif /* clone3.h */
diff --git a/sysdeps/unix/sysv/linux/createthread.c b/sysdeps/unix/sysv/linux/createthread.c
index bc3409b326..2b5e6cb131 100644
--- a/sysdeps/unix/sysv/linux/createthread.c
+++ b/sysdeps/unix/sysv/linux/createthread.c
@@ -18,6 +18,7 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sched.h>
+#include <clone3.h>
 #include <setjmp.h>
 #include <signal.h>
 #include <stdlib.h>
@@ -28,12 +29,6 @@ 
 
 #include <arch-fork.h>
 
-#ifdef __NR_clone2
-# define ARCH_CLONE __clone2
-#else
-# define ARCH_CLONE __clone
-#endif
-
 /* See the comments in pthread_create.c for the requirements for these
    two macros and the create_thread function.  */
 
@@ -47,7 +42,8 @@  static int start_thread (void *arg) __attribute__ ((noreturn));
 
 static int
 create_thread (struct pthread *pd, const struct pthread_attr *attr,
-	       bool *stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
+	       bool *stopped_start, void *stackaddr, size_t stacksize,
+	       bool *thread_ran)
 {
   /* Determine whether the newly created threads has to be started
      stopped since we have to set the scheduling parameters or set the
@@ -100,9 +96,18 @@  create_thread (struct pthread *pd, const struct pthread_attr *attr,
 
   TLS_DEFINE_INIT_TP (tp, pd);
 
-  if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
-				    clone_flags, pd, &pd->tid, tp, &pd->tid)
-			== -1))
+  struct clone_args args =
+    {
+      .flags = clone_flags,
+      .pidfd = (uintptr_t) &pd->tid,
+      .parent_tid = (uintptr_t) &pd->tid,
+      .child_tid = (uintptr_t) &pd->tid,
+      .stack = (uintptr_t) stackaddr,
+      .stack_size = stacksize,
+      .tls = (uintptr_t) tp,
+    };
+  int ret = __clone_internal (&args, &start_thread, pd);
+  if (__glibc_unlikely (ret == -1))
     return errno;
 
   /* It's started now, so if we fail below, we'll have to cancel it
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index 501f8fbccd..92b61e55d3 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -31,6 +31,7 @@ 
 #include <dl-sysdep.h>
 #include <libc-pointer-arith.h>
 #include <ldsodefs.h>
+#include <clone3.h>
 #include "spawn_int.h"
 
 /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
@@ -59,21 +60,6 @@ 
    normal program exit with the exit code 127.  */
 #define SPAWN_ERROR	127
 
-#ifdef __ia64__
-# define CLONE(__fn, __stackbase, __stacksize, __flags, __args) \
-  __clone2 (__fn, __stackbase, __stacksize, __flags, __args, 0, 0, 0)
-#else
-# define CLONE(__fn, __stack, __stacksize, __flags, __args) \
-  __clone (__fn, __stack, __flags, __args)
-#endif
-
-/* Since ia64 wants the stackbase w/clone2, re-use the grows-up macro.  */
-#if _STACK_GROWS_UP || defined (__ia64__)
-# define STACK(__stack, __stack_size) (__stack)
-#elif _STACK_GROWS_DOWN
-# define STACK(__stack, __stack_size) (__stack + __stack_size)
-#endif
-
 
 struct posix_spawn_args
 {
@@ -378,8 +364,14 @@  __spawnix (pid_t * pid, const char *file,
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
-  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+  struct clone_args clone_args =
+    {
+      .flags = CLONE_VM | CLONE_VFORK,
+      .exit_signal = SIGCHLD,
+      .stack = (uintptr_t) stack,
+      .stack_size = stack_size,
+    };
+  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
diff --git a/sysdeps/unix/sysv/linux/tst-align-clone.c b/sysdeps/unix/sysv/linux/tst-align-clone.c
index 6ace61bac3..5a463e7af3 100644
--- a/sysdeps/unix/sysv/linux/tst-align-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-align-clone.c
@@ -16,6 +16,7 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sched.h>
+#include <clone3.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -39,7 +40,7 @@  f (void *arg)
 }
 
 static int
-do_test (void)
+do_test_clone (bool use_clone_internal)
 {
   bool ok = true;
 
@@ -49,21 +50,41 @@  do_test (void)
     ok = false;
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024];
-  pid_t p = __clone2 (f, st, sizeof (st), 0, 0);
+# define STACK_SIZE 256 * 1024
+#else
+# define STACK_SIZE 128 * 1024
+#endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+
+  pid_t p;
+
+  if (use_clone_internal)
+    {
+      struct clone_args clone_args =
+	{
+	  .stack = (uintptr_t) st,
+	  .stack_size = sizeof (st),
+	};
+      p = __clone_internal (&clone_args, f, 0);
+    }
+  else
+    {
+#ifdef __ia64__
+      extern int __clone2 (int (*__fn) (void *__arg),
+			   void *__child_stack_base,
+			   size_t __child_stack_size, int __flags,
+			   void *__arg, ...);
+      p = __clone2 (f, st, sizeof (st), 0, 0);
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
 # if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), 0, 0);
+      p = clone (f, st + sizeof (st), 0, 0);
 # elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, 0, 0);
+      p = clone (f, st, 0, 0);
 # else
 #  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 # endif
 #endif
+    }
   if (p == -1)
     {
       printf("clone failed: %m\n");
@@ -91,5 +112,21 @@  do_test (void)
   return ok ? 0 : 1;
 }
 
+static int
+do_test (void)
+{
+  bool ok = true;
+
+  puts ("in main");
+
+  if (do_test_clone (false) != 0)
+    ok = false;
+
+  if (do_test_clone (true) != 0)
+    ok = false;
+
+  return ok ? 0 : 1;
+}
+
 #define TEST_FUNCTION do_test ()
 #include "../test-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/tst-clone.c b/sysdeps/unix/sysv/linux/tst-clone.c
index e6ae0106ef..855f48121e 100644
--- a/sysdeps/unix/sysv/linux/tst-clone.c
+++ b/sysdeps/unix/sysv/linux/tst-clone.c
@@ -22,6 +22,8 @@ 
 #include <stdlib.h>
 #include <unistd.h>
 #include <sched.h>
+#include <clone3.h>
+#include <stdbool.h>
 
 #ifdef __ia64__
 extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
@@ -35,14 +37,17 @@  int child_fn(void *arg)
 }
 
 static int
-do_test (void)
+do_test_clone (bool use_clone_internal)
 {
   int result;
 
+  if (use_clone_internal)
+    result = __clone_internal (NULL, child_fn, NULL);
+  else
 #ifdef __ia64__
-  result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
+    result = __clone2 (child_fn, NULL, 0, 0, NULL, NULL, NULL);
 #else
-  result = clone (child_fn, NULL, 0, NULL);
+    result = clone (child_fn, NULL, 0, NULL);
 #endif
 
   if (errno != EINVAL || result != -1)
@@ -52,6 +57,18 @@  do_test (void)
       return 1;
     }
 
+  return 0;
+}
+
+static int
+do_test (void)
+{
+  if (do_test_clone (false) != 0)
+    return 1;
+
+  if (do_test_clone (true) != 0)
+    return 1;
+
   puts ("All OK");
   return 0;
 }
diff --git a/sysdeps/unix/sysv/linux/tst-clone2.c b/sysdeps/unix/sysv/linux/tst-clone2.c
index ce36c70c0d..1df336f8c6 100644
--- a/sysdeps/unix/sysv/linux/tst-clone2.c
+++ b/sysdeps/unix/sysv/linux/tst-clone2.c
@@ -17,6 +17,7 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sched.h>
+#include <clone3.h>
 #include <signal.h>
 #include <string.h>
 #include <stdio.h>
@@ -58,7 +59,7 @@  f (void *a)
 
 
 static int
-do_test (void)
+do_test_clone (bool use_clone_internal)
 {
   sig = SIGRTMIN;
   sigset_t ss;
@@ -70,23 +71,43 @@  do_test (void)
   if (pipe2 (pipefd, O_CLOEXEC))
     FAIL_EXIT1 ("pipe failed: %m");
 
-  int clone_flags = 0;
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), clone_flags, 0);
-#else
-  char st[128 * 1024] __attribute__ ((aligned));
-#if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), clone_flags, 0);
-#elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, clone_flags, 0);
+# define STACK_SIZE 256 * 1024
 #else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# define STACK_SIZE 128 * 1024
 #endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+
+  pid_t p;
+
+  if (use_clone_internal)
+    {
+      struct clone_args clone_args =
+	{
+	  .stack = (uintptr_t) st,
+	  .stack_size = sizeof (st),
+	};
+      p = __clone_internal (&clone_args, f, 0);
+    }
+  else
+    {
+      int clone_flags = 0;
+#ifdef __ia64__
+      extern int __clone2 (int (*__fn) (void *__arg),
+			   void *__child_stack_base,
+			   size_t __child_stack_size, int __flags,
+			   void *__arg, ...);
+      p = __clone2 (f, st, sizeof (st), clone_flags, 0);
+#else
+# if _STACK_GROWS_DOWN
+      p = clone (f, st + sizeof (st), clone_flags, 0);
+# elif _STACK_GROWS_UP
+      p = clone (f, st, clone_flags, 0);
+# else
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
 #endif
+    }
 
   close (pipefd[1]);
 
@@ -143,4 +164,16 @@  do_test (void)
   return ret;
 }
 
+static int
+do_test (void)
+{
+  if (do_test_clone (false) != 0)
+    return 1;
+
+  if (do_test_clone (true) != 0)
+    return 1;
+
+  return 0;
+}
+
 #include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-clone3.c b/sysdeps/unix/sysv/linux/tst-clone3.c
index 1a1bfd4586..f4eabfc5a7 100644
--- a/sysdeps/unix/sysv/linux/tst-clone3.c
+++ b/sysdeps/unix/sysv/linux/tst-clone3.c
@@ -18,6 +18,8 @@ 
 
 #include <string.h>
 #include <sched.h>
+#include <clone3.h>
+#include <stdbool.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/syscall.h>
@@ -64,7 +66,7 @@  futex_wait (int *futexp, int val)
 }
 
 static int
-do_test (void)
+do_test_clone (bool use_clone_internal)
 {
   char st[1024] __attribute__ ((aligned));
   int clone_flags = CLONE_THREAD;
@@ -78,23 +80,39 @@  do_test (void)
   pid_t ctid = CTID_INIT_VAL;
   pid_t tid;
 
+  if (use_clone_internal)
+    {
+      struct clone_args clone_args =
+	{
+	  .flags = clone_flags & ~CSIGNAL,
+	  .exit_signal = clone_flags & CSIGNAL,
+	  .stack = (uintptr_t) st,
+	  .stack_size = sizeof (st),
+	  .child_tid = (uintptr_t) &ctid,
+	};
+      tid = __clone_internal (&clone_args, f, NULL);
+    }
+  else
+    {
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-		  /* tls */ NULL, &ctid);
-#else
-#if _STACK_GROWS_DOWN
-  tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
-	       /* tls */ NULL, &ctid);
-#elif _STACK_GROWS_UP
-  tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
-	       &ctid);
+      extern int __clone2 (int (*__fn) (void *__arg),
+			   void *__child_stack_base,
+			   size_t __child_stack_size, int __flags,
+			   void *__arg, ...);
+      tid = __clone2 (f, st, sizeof (st), clone_flags, NULL, /* ptid */ NULL,
+		      /* tls */ NULL, &ctid);
 #else
-#error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
-#endif
+# if _STACK_GROWS_DOWN
+      tid = clone (f, st + sizeof (st), clone_flags, NULL, /* ptid */ NULL,
+		   /* tls */ NULL, &ctid);
+# elif _STACK_GROWS_UP
+      tid = clone (f, st, clone_flags, NULL, /* ptid */ NULL, /* tls */ NULL,
+		   &ctid);
+# else
+#  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
+# endif
 #endif
+    }
   if (tid == -1)
     FAIL_EXIT1 ("clone failed: %m");
 
@@ -103,5 +121,17 @@  do_test (void)
   return 2;
 }
 
+static int
+do_test (void)
+{
+  if (do_test_clone (false) != 2)
+    return 1;
+
+  if (do_test_clone (true) != 2)
+    return 1;
+
+  return 2;
+}
+
 #define EXPECTED_STATUS 2
 #include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-getpid1.c b/sysdeps/unix/sysv/linux/tst-getpid1.c
index 253ebf2e15..1528743319 100644
--- a/sysdeps/unix/sysv/linux/tst-getpid1.c
+++ b/sysdeps/unix/sysv/linux/tst-getpid1.c
@@ -1,4 +1,6 @@ 
 #include <sched.h>
+#include <clone3.h>
+#include <stdbool.h>
 #include <signal.h>
 #include <string.h>
 #include <stdio.h>
@@ -27,7 +29,7 @@  f (void *a)
 
 
 static int
-do_test (void)
+do_test_clone (bool use_clone_internal)
 {
   int mypid = getpid ();
 
@@ -42,21 +44,43 @@  do_test (void)
     }
 
 #ifdef __ia64__
-  extern int __clone2 (int (*__fn) (void *__arg), void *__child_stack_base,
-		       size_t __child_stack_size, int __flags,
-		       void *__arg, ...);
-  char st[256 * 1024] __attribute__ ((aligned));
-  pid_t p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
+# define STACK_SIZE 256 * 1024
+#else
+# define STACK_SIZE 128 * 1024
+#endif
+  char st[STACK_SIZE] __attribute__ ((aligned));
+
+  pid_t p;
+
+  if (use_clone_internal)
+    {
+      struct clone_args clone_args =
+	{
+	  .flags = TEST_CLONE_FLAGS & ~CSIGNAL,
+	  .exit_signal = TEST_CLONE_FLAGS & CSIGNAL,
+	  .stack = (uintptr_t) st,
+	  .stack_size = sizeof (st),
+	};
+      p = __clone_internal (&clone_args, f, 0);
+    }
+  else
+    {
+#ifdef __ia64__
+      extern int __clone2 (int (*__fn) (void *__arg),
+			   void *__child_stack_base,
+			   size_t __child_stack_size, int __flags,
+			   void *__arg, ...);
+      p = __clone2 (f, st, sizeof (st), TEST_CLONE_FLAGS, 0);
 #else
-  char st[128 * 1024] __attribute__ ((aligned));
 # if _STACK_GROWS_DOWN
-  pid_t p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
+      p = clone (f, st + sizeof (st), TEST_CLONE_FLAGS, 0);
 # elif _STACK_GROWS_UP
-  pid_t p = clone (f, st, TEST_CLONE_FLAGS, 0);
+      p = clone (f, st, TEST_CLONE_FLAGS, 0);
 # else
 #  error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 # endif
 #endif
+    }
   if (p == -1)
     {
       printf("clone failed: %m\n");
@@ -118,5 +142,17 @@  do_test (void)
   return 0;
 }
 
+static int
+do_test (void)
+{
+  if (do_test_clone (false) != 0)
+    return 1;
+
+  if (do_test_clone (true) != 0)
+    return 1;
+
+  return 0;
+}
+
 #define TEST_FUNCTION do_test ()
 #include "../test-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/x86_64/clone3.S b/sysdeps/unix/sysv/linux/x86_64/clone3.S
new file mode 100644
index 0000000000..8a7a9a7478
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86_64/clone3.S
@@ -0,0 +1,92 @@ 
+/* The clone3 syscall wrapper.  Linux/x86-64 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* clone3() is even more special than fork() as it mucks with stacks
+   and invokes a function in the right context after its all over.  */
+
+#include <sysdep.h>
+#include <clone-offsets.h>
+
+/* The userland implementation is:
+   int clone3 (struct clone_args *cl_args, int (*fn)(void *arg),
+	       void *arg);
+   the kernel entry is:
+   int clone3 (struct clone_args *cl_args, size_t size);
+
+   The parameters are passed in registers from userland:
+   rdi: cl_args
+   rsi: fn
+   rdx: arg
+
+   The kernel expects:
+   rax: system call number
+   rdi: cl_args
+   rsi: size  */
+
+        .text
+ENTRY (__clone3)
+	/* Sanity check arguments.  */
+	movq	$-EINVAL, %rax
+	testq	%rdi, %rdi		/* No NULL cl_args pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+	testq	%rsi, %rsi		/* No NULL function pointer.  */
+	jz	SYSCALL_ERROR_LABEL
+
+	/* Save the function pointer in R8 which is preserved by the
+	   syscall.  */
+	movq	%rsi, %r8
+
+	/* Put sizeof (struct clone_args) in ESI.  */
+	movl	$CLONE_ARGS_SIZE , %esi
+
+	/* Do the system call.  */
+	movl	$SYS_ify(clone3), %eax
+
+	/* End FDE now, because in the child the unwind info will be
+	   wrong.  */
+	cfi_endproc
+	syscall
+
+	test	%RAX_LP, %RAX_LP
+	jl	SYSCALL_ERROR_LABEL
+	jz	L(thread_start)
+
+	ret
+
+L(thread_start):
+	cfi_startproc
+	/* Clearing frame pointer is insufficient, use CFI.  */
+	cfi_undefined (rip)
+	/* Clear the frame pointer.  The ABI suggests this be done, to mark
+	   the outermost frame obviously.  */
+	xorl	%ebp, %ebp
+
+	/* Set up arguments for the function call.  */
+	movq	%rdx, %rdi	/* Argument.  */
+	call	*%r8		/* Call function.  */
+	/* Call exit with return value from function call. */
+	movq	%rax, %rdi
+	movl	$SYS_ify(exit), %eax
+	syscall
+	cfi_endproc
+
+	cfi_startproc
+PSEUDO_END (__clone3)
+
+libc_hidden_def (__clone3)
+weak_alias (__clone3, clone3)