diff mbox series

[RFC] clone3: add CLONE3_RESET_SIGHAND

Message ID 20191008134417.16113-1-christian.brauner@ubuntu.com
State New
Headers show
Series [RFC] clone3: add CLONE3_RESET_SIGHAND | expand

Commit Message

Christian Brauner Oct. 8, 2019, 1:44 p.m. UTC
Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
Mutually exclusive with CLONE_SIGHAND to not disturb other threads
signal handler.

Questions for glibc before going to send this for review on lkml:
- Is it sufficient for glibc to get EINVAL when CLONE3_RESET_SIGNALS is
  passed to determine kernel support for it?
- Do you really want to have only those signals set to SIG_DFL which are
  not SIG_IGN or do you want to enforce _all_ signals are set to
  SIG_DFL even if they are SIG_IGN?

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 include/uapi/linux/sched.h | 3 +++
 kernel/fork.c              | 8 +++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

Comments

Christian Brauner Oct. 8, 2019, 2:14 p.m. UTC | #1
On Tue, Oct 08, 2019 at 03:44:17PM +0200, Christian Brauner wrote:
> Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
> Mutually exclusive with CLONE_SIGHAND to not disturb other threads
> signal handler.
> 
> Questions for glibc before going to send this for review on lkml:
> - Is it sufficient for glibc to get EINVAL when CLONE3_RESET_SIGNALS is
>   passed to determine kernel support for it?
> - Do you really want to have only those signals set to SIG_DFL which are
>   not SIG_IGN or do you want to enforce _all_ signals are set to
>   SIG_DFL even if they are SIG_IGN?
> 
> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
> ---
>  include/uapi/linux/sched.h | 3 +++
>  kernel/fork.c              | 8 +++++++-
>  2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 99335e1f4a27..e3bc43efbbba 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -33,6 +33,9 @@
>  #define CLONE_NEWNET		0x40000000	/* New network namespace */
>  #define CLONE_IO		0x80000000	/* Clone io context */
>  
> +/* Flags for the clone3() syscall */
> +#define CLONE3_RESET_SIGHAND 0x100000000ULL /* Reset any signal handler which is not SIG_IGN or SIG_DFL to SIG_DFL. */
> +
>  #ifndef __ASSEMBLY__
>  /**
>   * struct clone_args - arguments for the clone3 syscall
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1f6c45f6a734..3bced4a2931e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1516,6 +1516,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
>  	refcount_set(&sig->count, 1);
>  	spin_lock_irq(&current->sighand->siglock);
>  	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
> +	/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
> +	if (clone_flags & CLONE3_RESET_SIGHAND)
> +		flush_signal_handlers(tsk, 0);
>  	spin_unlock_irq(&current->sighand->siglock);
>  	return 0;
>  }
> @@ -2567,7 +2570,7 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>  	 * All lower bits of the flag word are taken.
>  	 * Verify that no other unknown flags are passed along.
>  	 */
> -	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> +	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE3_RESET_SIGNALS))
>  		return false;
>  
>  	/*
> @@ -2577,6 +2580,9 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>  	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
>  		return false;
>  
> +	if (kargs->flags & (CLONE_SIGHAND | CLONE3_RESET_SIGHAND))
> +		return false;

That obviously should've been:

if ((kargs->flags & (CLONE_SIGHAND | CLONE3_RESET_SIGHAND)) ==
    (CLONE_SIGHAND | CLONE3_RESET_SIGHAND))
	return false;

but you get the gist.

Thanks!
Christian
Adhemerval Zanella Oct. 8, 2019, 2:20 p.m. UTC | #2
On 08/10/2019 10:44, Christian Brauner wrote:
> Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
> Mutually exclusive with CLONE_SIGHAND to not disturb other threads
> signal handler.
> 
> Questions for glibc before going to send this for review on lkml:
> - Is it sufficient for glibc to get EINVAL when CLONE3_RESET_SIGNALS is
>   passed to determine kernel support for it?

It makes the clone/clone3 call on posix_spawn somewhat more complex, something
like:

#ifdef __ASSUME_CLONE3
   /* This clone3 is modelled with glibc clone semantics where a function
      pointer is passed and executed in child process.  */
   new_pid = clone3 (__spawni_child, __spawni_args, 
		     &((struct clone_args) {CLONE3_RESET_SIGHAND, ... }), ...);
   if (new_pid == -1 && errno == EINVAL)
     {
       __spawni_args.xflags |= RESET_SIGNALS;
        new_pid = clone3 (__spawni_child, __spawni_args, 
		          &((struct clone_args) {0, ... }), ...);
     }
#else
   __spawni_args.xflags |= RESET_SIGNALS;
   new_pid = clone (__spawni_child, __spawni_args, ...);
#endif

We can add __ASSUME_CLONE3_RESET_SIGHAND which would allow to not require
check for EINVAL, but it would be another build permutation...

Is there a way to advertise it to *child* process somehow? Now that flags
is passed in a struct, maybe reset the flags that current kernel does not
accept so underlying process could act accordantly? 

> - Do you really want to have only those signals set to SIG_DFL which are
>   not SIG_IGN or do you want to enforce _all_ signals are set to
>   SIG_DFL even if they are SIG_IGN?

We only current support POSIX_SPAWN_SETSIGDEF, but Solaris 11, for instance,
also supports POSIX_SPAWN_SETSIGIGN_NP as an extension. I am not sure how
useful this would be, neither an I can think of an specific usercase where
it would be required.

> 
> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
> ---
>  include/uapi/linux/sched.h | 3 +++
>  kernel/fork.c              | 8 +++++++-
>  2 files changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 99335e1f4a27..e3bc43efbbba 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -33,6 +33,9 @@
>  #define CLONE_NEWNET		0x40000000	/* New network namespace */
>  #define CLONE_IO		0x80000000	/* Clone io context */
>  
> +/* Flags for the clone3() syscall */
> +#define CLONE3_RESET_SIGHAND 0x100000000ULL /* Reset any signal handler which is not SIG_IGN or SIG_DFL to SIG_DFL. */
> +
>  #ifndef __ASSEMBLY__
>  /**
>   * struct clone_args - arguments for the clone3 syscall
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 1f6c45f6a734..3bced4a2931e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1516,6 +1516,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
>  	refcount_set(&sig->count, 1);
>  	spin_lock_irq(&current->sighand->siglock);
>  	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
> +	/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
> +	if (clone_flags & CLONE3_RESET_SIGHAND)
> +		flush_signal_handlers(tsk, 0);
>  	spin_unlock_irq(&current->sighand->siglock);
>  	return 0;
>  }
> @@ -2567,7 +2570,7 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>  	 * All lower bits of the flag word are taken.
>  	 * Verify that no other unknown flags are passed along.
>  	 */
> -	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
> +	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE3_RESET_SIGNALS))
>  		return false;
>  
>  	/*
> @@ -2577,6 +2580,9 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs)
>  	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
>  		return false;
>  
> +	if (kargs->flags & (CLONE_SIGHAND | CLONE3_RESET_SIGHAND))
> +		return false;
> +
>  	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
>  	    kargs->exit_signal)
>  		return false;
>
Christian Brauner Oct. 9, 2019, 10:48 a.m. UTC | #3
On Tue, Oct 08, 2019 at 11:20:08AM -0300, Adhemerval Zanella wrote:
> 
> 
> On 08/10/2019 10:44, Christian Brauner wrote:
> > Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
> > Mutually exclusive with CLONE_SIGHAND to not disturb other threads
> > signal handler.
> > 
> > Questions for glibc before going to send this for review on lkml:
> > - Is it sufficient for glibc to get EINVAL when CLONE3_RESET_SIGNALS is
> >   passed to determine kernel support for it?
> 
> It makes the clone/clone3 call on posix_spawn somewhat more complex, something
> like:
> 
> #ifdef __ASSUME_CLONE3
>    /* This clone3 is modelled with glibc clone semantics where a function
>       pointer is passed and executed in child process.  */
>    new_pid = clone3 (__spawni_child, __spawni_args, 
> 		     &((struct clone_args) {CLONE3_RESET_SIGHAND, ... }), ...);
>    if (new_pid == -1 && errno == EINVAL)
>      {
>        __spawni_args.xflags |= RESET_SIGNALS;
>         new_pid = clone3 (__spawni_child, __spawni_args, 
> 		          &((struct clone_args) {0, ... }), ...);
>      }
> #else
>    __spawni_args.xflags |= RESET_SIGNALS;
>    new_pid = clone (__spawni_child, __spawni_args, ...);
> #endif
> 
> We can add __ASSUME_CLONE3_RESET_SIGHAND which would allow to not require
> check for EINVAL, but it would be another build permutation...
> 
> Is there a way to advertise it to *child* process somehow? Now that flags
> is passed in a struct, maybe reset the flags that current kernel does not
> accept so underlying process could act accordantly? 

I've been thinking about two things how to do this:
- mask the flags that the kernel does not support
- add another argument to struct clone_args that is "known_flags"
  when the syscall returns it'll be set to all the flags this kernel
  knows about

The advantage with the latter method is that the struct get's extended
with an additonal argument. And since clone3() deals with larger structs
correctly you can pass a larger struct from userspace with the
known_flags member included and if it is non-zero when clone3() returns
you know a) that this kernel supports the known_flags extension and b)
you know what flags this kernel supports...

Florian, Adhmerval et al. what would you prefer?

Christian
Florian Weimer Oct. 9, 2019, 11:04 a.m. UTC | #4
* Christian Brauner:

> I've been thinking about two things how to do this:
> - mask the flags that the kernel does not support

That doesn't look fully backwards-compatible to me.  The argument isn't
currently read/write, is it?  It would work for us though.

> - add another argument to struct clone_args that is "known_flags"
>   when the syscall returns it'll be set to all the flags this kernel
>   knows about

This needs some sort of protocol to detect whether the argument was
updated.  I suppose we could define CLONE3_INITIALLY_SUPPORTED_FLAGS
with all the flag bits currently supported and tell developers to
initialize struct clone_args with:

  .known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS,

Then the result would be correct whether or not known_flags is supported
by the kernel or not.  This too would work fine for glibc internal use.

By theway, I don't think we have a good userspace API story for
extensible *output* arguments yet.  Every system call does things a
little bit differently there.

Thanks,
Florian
Christian Brauner Oct. 9, 2019, 11:12 a.m. UTC | #5
On Wed, Oct 09, 2019 at 01:04:03PM +0200, Florian Weimer wrote:
> * Christian Brauner:
> 
> > I've been thinking about two things how to do this:
> > - mask the flags that the kernel does not support
> 
> That doesn't look fully backwards-compatible to me.  The argument isn't
> currently read/write, is it?  It would work for us though.
> 
> > - add another argument to struct clone_args that is "known_flags"
> >   when the syscall returns it'll be set to all the flags this kernel
> >   knows about
> 
> This needs some sort of protocol to detect whether the argument was
> updated.  I suppose we could define CLONE3_INITIALLY_SUPPORTED_FLAGS
> with all the flag bits currently supported and tell developers to
> initialize struct clone_args with:
> 
>   .known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS,

That won't work. Older kernels will verify that parts of the struct that
are not known are set to 0.
I wonder, what is stopping you from

struct clone args args = {
	.known_flags = 0,
};

pid_t pid = clone3(&args, sizeof(args));
if (pid < 0)
	return -1;

######### kernel code ############
/* on a kernel that is aware of known_flags */
kargs->known_flags = CLONE3_SUPPORTED_FLAGS;
##################################

if (!args.known_flags)
	/* kernel doesn't not support the known_flags extension */

if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
	/* 
	 * kernel does support the known_flags extension and does
	 * support the feature I care about 
	 */

> 
> Then the result would be correct whether or not known_flags is supported
> by the kernel or not.  This too would work fine for glibc internal use.
> 
> By theway, I don't think we have a good userspace API story for
> extensible *output* arguments yet.  Every system call does things a
> little bit differently there.

Yeah, I'm slowly pushing in this direction but you're right it is nasty
currently... Traditionally syscalls mix and match output arguments in a
single struct. I've honestly have seen now need for a separate output
struct for clone3(). But yeah, we should probably standardize this. We
should have a session somewhere or a mailing list discussion at least.
Though I'd prefer the in-person discussion.

Thanks!
Christian
Dmitry V. Levin Oct. 9, 2019, 11:14 a.m. UTC | #6
On Wed, Oct 09, 2019 at 01:04:03PM +0200, Florian Weimer wrote:
[...]
> By theway, I don't think we have a good userspace API story for
> extensible *output* arguments yet.

I believe PTRACE_GET_SYSCALL_INFO does the right thing wrt extensible
output arguments. :)
Florian Weimer Oct. 9, 2019, 11:56 a.m. UTC | #7
* Christian Brauner:

> On Wed, Oct 09, 2019 at 01:04:03PM +0200, Florian Weimer wrote:
>> * Christian Brauner:
>> 
>> > I've been thinking about two things how to do this:
>> > - mask the flags that the kernel does not support
>> 
>> That doesn't look fully backwards-compatible to me.  The argument isn't
>> currently read/write, is it?  It would work for us though.
>> 
>> > - add another argument to struct clone_args that is "known_flags"
>> >   when the syscall returns it'll be set to all the flags this kernel
>> >   knows about
>> 
>> This needs some sort of protocol to detect whether the argument was
>> updated.  I suppose we could define CLONE3_INITIALLY_SUPPORTED_FLAGS
>> with all the flag bits currently supported and tell developers to
>> initialize struct clone_args with:
>> 
>>   .known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS,
>
> That won't work. Older kernels will verify that parts of the struct that
> are not known are set to 0.

Good point.

> I wonder, what is stopping you from
>
> struct clone args args = {
> 	.known_flags = 0,
> };
>
> pid_t pid = clone3(&args, sizeof(args));
> if (pid < 0)
> 	return -1;
>
> ######### kernel code ############
> /* on a kernel that is aware of known_flags */
> kargs->known_flags = CLONE3_SUPPORTED_FLAGS;
> ##################################
>
> if (!args.known_flags)
> 	/* kernel doesn't not support the known_flags extension */
>
> if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
> 	/* 
> 	 * kernel does support the known_flags extension and does
> 	 * support the feature I care about 
> 	 */

With this construct, the application programmer needs to remember which
flags are old and new (predate and postdate known_flags).  It's too easy
to make mistakes there.

What about this?

  pid_t pid = clone3 (&args, sizeof (args));
  if (pid < 0)
    return -1;

  if (args.known_flags == 0)
    args.known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS;

  if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
    /* Kernel does support the known_flags extension and does
       support the feature I care about.  */

We could hide this in the clone3 wrapper for glibc if we start out with
a struct clone_args that has this member.

Thanks,
Florian
Christian Brauner Oct. 9, 2019, 11:58 a.m. UTC | #8
On Wed, Oct 09, 2019 at 01:56:15PM +0200, Florian Weimer wrote:
> * Christian Brauner:
> 
> > On Wed, Oct 09, 2019 at 01:04:03PM +0200, Florian Weimer wrote:
> >> * Christian Brauner:
> >> 
> >> > I've been thinking about two things how to do this:
> >> > - mask the flags that the kernel does not support
> >> 
> >> That doesn't look fully backwards-compatible to me.  The argument isn't
> >> currently read/write, is it?  It would work for us though.
> >> 
> >> > - add another argument to struct clone_args that is "known_flags"
> >> >   when the syscall returns it'll be set to all the flags this kernel
> >> >   knows about
> >> 
> >> This needs some sort of protocol to detect whether the argument was
> >> updated.  I suppose we could define CLONE3_INITIALLY_SUPPORTED_FLAGS
> >> with all the flag bits currently supported and tell developers to
> >> initialize struct clone_args with:
> >> 
> >>   .known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS,
> >
> > That won't work. Older kernels will verify that parts of the struct that
> > are not known are set to 0.
> 
> Good point.
> 
> > I wonder, what is stopping you from
> >
> > struct clone args args = {
> > 	.known_flags = 0,
> > };
> >
> > pid_t pid = clone3(&args, sizeof(args));
> > if (pid < 0)
> > 	return -1;
> >
> > ######### kernel code ############
> > /* on a kernel that is aware of known_flags */
> > kargs->known_flags = CLONE3_SUPPORTED_FLAGS;
> > ##################################
> >
> > if (!args.known_flags)
> > 	/* kernel doesn't not support the known_flags extension */
> >
> > if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
> > 	/* 
> > 	 * kernel does support the known_flags extension and does
> > 	 * support the feature I care about 
> > 	 */
> 
> With this construct, the application programmer needs to remember which
> flags are old and new (predate and postdate known_flags).  It's too easy
> to make mistakes there.
> 
> What about this?
> 
>   pid_t pid = clone3 (&args, sizeof (args));
>   if (pid < 0)
>     return -1;
> 
>   if (args.known_flags == 0)
>     args.known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS;
> 
>   if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
>     /* Kernel does support the known_flags extension and does
>        support the feature I care about.  */
> 
> We could hide this in the clone3 wrapper for glibc if we start out with
> a struct clone_args that has this member.

So the kernel semantics I suggested but when the kernel does not support
it have and doesn't set it have glibc set this?
Yeah, that sounds like a good idea to me!

Thanks!
Christian
Florian Weimer Oct. 9, 2019, 12:01 p.m. UTC | #9
* Christian Brauner:

>> With this construct, the application programmer needs to remember which
>> flags are old and new (predate and postdate known_flags).  It's too easy
>> to make mistakes there.
>> 
>> What about this?
>> 
>>   pid_t pid = clone3 (&args, sizeof (args));
>>   if (pid < 0)
>>     return -1;
>> 
>>   if (args.known_flags == 0)
>>     args.known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS;
>> 
>>   if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
>>     /* Kernel does support the known_flags extension and does
>>        support the feature I care about.  */
>> 
>> We could hide this in the clone3 wrapper for glibc if we start out with
>> a struct clone_args that has this member.
>
> So the kernel semantics I suggested but when the kernel does not support
> it have and doesn't set it have glibc set this?

Exactly.

> Yeah, that sounds like a good idea to me!

Good.  Please get this into the kernel. 8-)

Thanks,
Florian
Christian Brauner Oct. 9, 2019, 1:33 p.m. UTC | #10
On Wed, Oct 09, 2019 at 02:01:02PM +0200, Florian Weimer wrote:
> * Christian Brauner:
> 
> >> With this construct, the application programmer needs to remember which
> >> flags are old and new (predate and postdate known_flags).  It's too easy
> >> to make mistakes there.
> >> 
> >> What about this?
> >> 
> >>   pid_t pid = clone3 (&args, sizeof (args));
> >>   if (pid < 0)
> >>     return -1;
> >> 
> >>   if (args.known_flags == 0)
> >>     args.known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS;
> >> 
> >>   if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
> >>     /* Kernel does support the known_flags extension and does
> >>        support the feature I care about.  */
> >> 
> >> We could hide this in the clone3 wrapper for glibc if we start out with
> >> a struct clone_args that has this member.
> >
> > So the kernel semantics I suggested but when the kernel does not support
> > it have and doesn't set it have glibc set this?
> 
> Exactly.
> 
> > Yeah, that sounds like a good idea to me!
> 
> Good.  Please get this into the kernel. 8-)

Will do. :)

Thanks!
Christian
Christian Brauner Oct. 10, 2019, 10:51 a.m. UTC | #11
On Wed, Oct 09, 2019 at 03:33:41PM +0200, Christian Brauner wrote:
> On Wed, Oct 09, 2019 at 02:01:02PM +0200, Florian Weimer wrote:
> > * Christian Brauner:
> > 
> > >> With this construct, the application programmer needs to remember which
> > >> flags are old and new (predate and postdate known_flags).  It's too easy
> > >> to make mistakes there.
> > >> 
> > >> What about this?
> > >> 
> > >>   pid_t pid = clone3 (&args, sizeof (args));
> > >>   if (pid < 0)
> > >>     return -1;
> > >> 
> > >>   if (args.known_flags == 0)
> > >>     args.known_flags = CLONE3_INITIALLY_SUPPORTED_FLAGS;
> > >> 
> > >>   if (args.known_flags & NEW_FLAG_I_CARE_ABOUT)
> > >>     /* Kernel does support the known_flags extension and does
> > >>        support the feature I care about.  */
> > >> 
> > >> We could hide this in the clone3 wrapper for glibc if we start out with
> > >> a struct clone_args that has this member.
> > >
> > > So the kernel semantics I suggested but when the kernel does not support
> > > it have and doesn't set it have glibc set this?
> > 
> > Exactly.
> > 
> > > Yeah, that sounds like a good idea to me!
> > 
> > Good.  Please get this into the kernel. 8-)
> 
> Will do. :)

Ok, I need to be annoying once more. (As if there wasn't enough of that
already...)
So kernel-side a good friend of mine - Aleksa - and I are on the way to
push through _some_ standards... That very much includes how we check
for new extensions in syscalls. So we just discussed my earlier proposal
(Sorry for the lag the distance Australia to Germany makes things a bit
difficult.). One problem we'll have with adding a new member is that we
arguably need one for each additional flag member we add. That means
we'd need:

struct clone_args /* initial struct */ {
	__aligned_u64 flags;
	__aligned_u64 supported_flags;
}

and later

struct clone_args /* extended struct: new flags argument added */ {
	__aligned_u64 flags;
	__aligned_u64 supported_flags;
	__aligned_u64 flags2;
	__aligned_u64 supported_flags2;
}

which is not ideal. So we were thinking about a different protocol for
syscalls which embedd flags arguments in structs.

We'd introduce a new flag

#define SYSCALL_CHECK_FLAGS ((1 << 63)ULL)

that can be used by e.g. clone3() and openat2(). Then you can call the
syscall with:

struct clone_args args = {
	.flags |= SYSCALL_CHECK_FLAGS,
};
pid = clone3(&args, sizeof(args));

/* 
 * The kernel now sets all flags it supports in _all_ flags arguments
 * present in the struct.
 */

SYSCALL_CHECK_FLAGS is only valid in the initial flags argument not in
any added flags arguments.
If SYSCALL_CHECK_FLAGS is not supported then you'll get EINVAL and the
SYSCALL_CHECK_FLAGS flag is still present in the flags argument of the
struct. If it is supported it'll be masked out and all supported flag
bits will be set by the kernel in _all_ flag arguments that are present
in the structure.
Florian, thoughts?

Thanks!
Christian
Florian Weimer Oct. 10, 2019, 2:49 p.m. UTC | #12
* Christian Brauner:

> Ok, I need to be annoying once more. (As if there wasn't enough of that
> already...)
> So kernel-side a good friend of mine - Aleksa - and I are on the way to
> push through _some_ standards... That very much includes how we check
> for new extensions in syscalls. So we just discussed my earlier proposal
> (Sorry for the lag the distance Australia to Germany makes things a bit
> difficult.). One problem we'll have with adding a new member is that we
> arguably need one for each additional flag member we add. That means
> we'd need:
>
> struct clone_args /* initial struct */ {
> 	__aligned_u64 flags;
> 	__aligned_u64 supported_flags;
> }
>
> and later
>
> struct clone_args /* extended struct: new flags argument added */ {
> 	__aligned_u64 flags;
> 	__aligned_u64 supported_flags;
> 	__aligned_u64 flags2;
> 	__aligned_u64 supported_flags2;
> }
>
> which is not ideal. So we were thinking about a different protocol for
> syscalls which embedd flags arguments in structs.

Is that really a bad problem?

> We'd introduce a new flag
>
> #define SYSCALL_CHECK_FLAGS ((1 << 63)ULL)
>
> that can be used by e.g. clone3() and openat2(). Then you can call the
> syscall with:
>
> struct clone_args args = {
> 	.flags |= SYSCALL_CHECK_FLAGS,
> };
> pid = clone3(&args, sizeof(args));
>
> /* 
>  * The kernel now sets all flags it supports in _all_ flags arguments
>  * present in the struct.
>  */
>
> SYSCALL_CHECK_FLAGS is only valid in the initial flags argument not in
> any added flags arguments.
> If SYSCALL_CHECK_FLAGS is not supported then you'll get EINVAL and the
> SYSCALL_CHECK_FLAGS flag is still present in the flags argument of the
> struct. If it is supported it'll be masked out and all supported flag
> bits will be set by the kernel in _all_ flag arguments that are present
> in the structure.
> Florian, thoughts?

Will SYSCALL_CHECK_FLAGS be for probing only and not perform any action
at all?  I mean, it behaves like this on older kernels due to the EINVAL
error.

I'm not yet decided if we'll need a separate system call wrapper for
cloning on the same stack, for safe(r) use with vfork.  If we do, we'll
have to teach the wrapper about SYSCALL_CHECK_FLAGS, too, I think.  If
a call with SYSCALL_CHECK_FLAGS and all supported flags performs the
requested clone action, that would be easier for us.  But I don't know
if that makes a convenient interface for programmers.

Thanks,
Florian
Christian Brauner Oct. 11, 2019, 10:46 a.m. UTC | #13
On Thu, Oct 10, 2019 at 04:49:26PM +0200, Florian Weimer wrote:
> * Christian Brauner:
> 
> > Ok, I need to be annoying once more. (As if there wasn't enough of that
> > already...)
> > So kernel-side a good friend of mine - Aleksa - and I are on the way to
> > push through _some_ standards... That very much includes how we check
> > for new extensions in syscalls. So we just discussed my earlier proposal
> > (Sorry for the lag the distance Australia to Germany makes things a bit
> > difficult.). One problem we'll have with adding a new member is that we
> > arguably need one for each additional flag member we add. That means
> > we'd need:
> >
> > struct clone_args /* initial struct */ {
> > 	__aligned_u64 flags;
> > 	__aligned_u64 supported_flags;
> > }
> >
> > and later
> >
> > struct clone_args /* extended struct: new flags argument added */ {
> > 	__aligned_u64 flags;
> > 	__aligned_u64 supported_flags;
> > 	__aligned_u64 flags2;
> > 	__aligned_u64 supported_flags2;
> > }
> >
> > which is not ideal. So we were thinking about a different protocol for
> > syscalls which embedd flags arguments in structs.
> 
> Is that really a bad problem?

Kernel people don't like wasting space so yes... :)
In other words, we need to come up with something that we can convince
both sides is useful.

> 
> > We'd introduce a new flag
> >
> > #define SYSCALL_CHECK_FLAGS ((1 << 63)ULL)
> >
> > that can be used by e.g. clone3() and openat2(). Then you can call the
> > syscall with:
> >
> > struct clone_args args = {
> > 	.flags |= SYSCALL_CHECK_FLAGS,
> > };
> > pid = clone3(&args, sizeof(args));
> >
> > /* 
> >  * The kernel now sets all flags it supports in _all_ flags arguments
> >  * present in the struct.
> >  */
> >
> > SYSCALL_CHECK_FLAGS is only valid in the initial flags argument not in
> > any added flags arguments.
> > If SYSCALL_CHECK_FLAGS is not supported then you'll get EINVAL and the
> > SYSCALL_CHECK_FLAGS flag is still present in the flags argument of the
> > struct. If it is supported it'll be masked out and all supported flag
> > bits will be set by the kernel in _all_ flag arguments that are present
> > in the structure.
> > Florian, thoughts?
> 
> Will SYSCALL_CHECK_FLAGS be for probing only and not perform any action
> at all?  I mean, it behaves like this on older kernels due to the EINVAL
> error.

Right, that's a good question.
I think we want it to only report features.
I'll need to think about this for a bit.
If this flag doesn't cause any action then we need to report a
meaningful error to the caller.

> 
> I'm not yet decided if we'll need a separate system call wrapper for
> cloning on the same stack, for safe(r) use with vfork.  If we do, we'll
> have to teach the wrapper about SYSCALL_CHECK_FLAGS, too, I think.  If
> a call with SYSCALL_CHECK_FLAGS and all supported flags performs the
> requested clone action, that would be easier for us.  But I don't know
> if that makes a convenient interface for programmers.

Phew... That's a hard decision to make but it feels wrong to mix these
semantics.

Christian
Christian Brauner Dec. 4, 2019, 1:16 p.m. UTC | #14
On Tue, Oct 08, 2019 at 03:44:17PM +0200, Christian Brauner wrote:
> Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
> Mutually exclusive with CLONE_SIGHAND to not disturb other threads
> signal handler.

Hey,

Just a heads up that Linus pulled CLONE_CLEAR_SIGHAND with other changes
for v5.5:

clone3: add CLONE_CLEAR_SIGHAND:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b612e5df4587c934bd056bf05f4a1deca4de4f75

tests: test CLONE_CLEAR_SIGHAND
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de5287235631cc561716d85f984614ef9598a5cc

	Christian
Florian Weimer Dec. 4, 2019, 1:45 p.m. UTC | #15
* Christian Brauner:

> On Tue, Oct 08, 2019 at 03:44:17PM +0200, Christian Brauner wrote:
>> Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL.
>> Mutually exclusive with CLONE_SIGHAND to not disturb other threads
>> signal handler.
>
> Hey,
>
> Just a heads up that Linus pulled CLONE_CLEAR_SIGHAND with other changes
> for v5.5:
>
> clone3: add CLONE_CLEAR_SIGHAND:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b612e5df4587c934bd056bf05f4a1deca4de4f75
>
> tests: test CLONE_CLEAR_SIGHAND
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de5287235631cc561716d85f984614ef9598a5cc

Very nice, thanks.

Florian
diff mbox series

Patch

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 99335e1f4a27..e3bc43efbbba 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -33,6 +33,9 @@ 
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
 
+/* Flags for the clone3() syscall */
+#define CLONE3_RESET_SIGHAND 0x100000000ULL /* Reset any signal handler which is not SIG_IGN or SIG_DFL to SIG_DFL. */
+
 #ifndef __ASSEMBLY__
 /**
  * struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c
index 1f6c45f6a734..3bced4a2931e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1516,6 +1516,9 @@  static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 	refcount_set(&sig->count, 1);
 	spin_lock_irq(&current->sighand->siglock);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+	/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
+	if (clone_flags & CLONE3_RESET_SIGHAND)
+		flush_signal_handlers(tsk, 0);
 	spin_unlock_irq(&current->sighand->siglock);
 	return 0;
 }
@@ -2567,7 +2570,7 @@  static bool clone3_args_valid(const struct kernel_clone_args *kargs)
 	 * All lower bits of the flag word are taken.
 	 * Verify that no other unknown flags are passed along.
 	 */
-	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE3_RESET_SIGNALS))
 		return false;
 
 	/*
@@ -2577,6 +2580,9 @@  static bool clone3_args_valid(const struct kernel_clone_args *kargs)
 	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
 		return false;
 
+	if (kargs->flags & (CLONE_SIGHAND | CLONE3_RESET_SIGHAND))
+		return false;
+
 	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
 	    kargs->exit_signal)
 		return false;