
[RFC,v3,03/23] sysdeps/wait: Use waitid if available

Message ID 2d3359a195633b85e01f83bf536330d72f7bc8aa.1563321715.git.alistair.francis@wdc.com
State New
Series [RFC,v3,01/23] sysdeps/nanosleep: Use clock_nanosleep_time64 if available

Commit Message

Alistair Francis July 17, 2019, 12:08 a.m. UTC
If the waitid syscall is available, let's use that, as waitpid
and wait4 aren't always available (they aren't available on RV32).

Unfortunately waitid is substantially different from waitpid and wait4, so
the conversion ends up being complex.

Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
---
 ChangeLog                                  |  3 ++
 sysdeps/unix/sysv/linux/wait.c             | 39 ++++++++++++++++--
 sysdeps/unix/sysv/linux/waitpid.c          | 46 ++++++++++++++++++++++
 sysdeps/unix/sysv/linux/waitpid_nocancel.c | 45 +++++++++++++++++++++
 4 files changed, 130 insertions(+), 3 deletions(-)

Comments

Florian Weimer July 17, 2019, 5:31 a.m. UTC | #1
* Alistair Francis:

> +#ifdef __NR_waitid

I wonder if the condition should be

  #ifndef __NR_wait4

etc.  That would make it less risky for the existing architectures.
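
In other words, gate the new path on the absence of the old syscalls
rather than the presence of waitid -- roughly this shape (a sketch,
not patch text):

  #ifdef __NR_waitpid
    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
  #elif defined (__NR_wait4)
    /* Existing architectures keep the known-good wait4 path.  */
  #else
    /* Neither syscall (e.g. rv32): use the waitid-based emulation.  */
  #endif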

> +    case CLD_DUMPED:
> +      *stat_loc = 0x80;
> +    case CLD_KILLED:
> +      *stat_loc |= infop.si_status;
> +      break;

The kernel does this (in kernel/exit.c):

        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;
…
        if (infop) {
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

Therefore, I wonder if you need to propagate the lower bits from
si_status for CLD_DUMPED, too.
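
For illustration, the full inverse of that kernel mapping would look
something like this (a hedged sketch with a hypothetical helper name
and headers elided, not the patch's actual code):

  static int
  stat_from_siginfo (const siginfo_t *infop)
  {
    switch (infop->si_code)
      {
      case CLD_EXITED:
        /* Exit status occupies bits 8-15 of the wait status.  */
        return (infop->si_status & 0xff) << 8;
      case CLD_DUMPED:
        /* Core-dump flag, plus the low seven signal bits.  */
        return 0x80 | (infop->si_status & 0x7f);
      case CLD_KILLED:
        return infop->si_status & 0x7f;
      default:
        return -1;
      }
  }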

For wait/wait3/waitpid, you could probably use the implementations for
sysdeps/unix/bsd, layered on top of wait4.

The patch also needs some formatting tweaks regarding the placement of
braces, and there are missing spaces before parentheses.

Thanks,
Florian
Alistair Francis July 19, 2019, 5:49 p.m. UTC | #2
On Tue, Jul 16, 2019 at 10:31 PM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Alistair Francis:
>
> > +#ifdef __NR_waitid
>
> I wonder if the condition should be
>
>   #ifndef __NR_wait4
>
> etc.  That would make it less risky for the existing architectures.

Seems fair, I have updated that.

>
> > +    case CLD_DUMPED:
> > +      *stat_loc = 0x80;
> > +    case CLD_KILLED:
> > +      *stat_loc |= infop.si_status;
> > +      break;
>
> The kernel does this (in kernel/exit.c):
>
>         status = (p->signal->flags & SIGNAL_GROUP_EXIT)
>                 ? p->signal->group_exit_code : p->exit_code;
>         wo->wo_stat = status;
> …
>         if (infop) {
>                 if ((status & 0x7f) == 0) {
>                         infop->cause = CLD_EXITED;
>                         infop->status = status >> 8;
>                 } else {
>                         infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
>                         infop->status = status & 0x7f;
>                 }
>                 infop->pid = pid;
>                 infop->uid = uid;
>         }
>
> Therefore, I wonder if you need to propagate the lower bits from
> si_status for CLD_DUMPED, too.

We are propagating the bits due to a fall-through in the switch
statement; I have added a comment to indicate that we are falling
through.
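
Concretely, the hunk now reads something like this (a sketch based on
the quoted code, not the final patch text):

  case CLD_DUMPED:
    *stat_loc = 0x80;
    /* Fall through.  */
  case CLD_KILLED:
    *stat_loc |= infop.si_status;
    break;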

>
> For wait/wait3/waitpid, you could probably use the implementations for
> sysdeps/unix/bsd, layered on top of wait4.

I'm not sure what you mean here.

>
> The patch also needs some formatting tweaks regarding the placement of
> braces, and there are missing spaces before parentheses.

Yep, I have tried to fix up all the formatting issues.

Alistair

>
> Thanks,
> Florian
Rich Felker July 21, 2019, 4:03 a.m. UTC | #3
On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
> If the waitid syscall is available, let's use that, as waitpid
> and wait4 aren't always available (they aren't available on RV32).
> 
> Unfortunately waitid is substantially different from waitpid and wait4, so
> the conversion ends up being complex.
> 
> Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> ---
>  ChangeLog                                  |  3 ++
>  sysdeps/unix/sysv/linux/wait.c             | 39 ++++++++++++++++--
>  sysdeps/unix/sysv/linux/waitpid.c          | 46 ++++++++++++++++++++++
>  sysdeps/unix/sysv/linux/waitpid_nocancel.c | 45 +++++++++++++++++++++
>  4 files changed, 130 insertions(+), 3 deletions(-)
> [...]
>  
>  weak_alias (__libc_wait, __wait)
> diff --git a/sysdeps/unix/sysv/linux/waitpid.c b/sysdeps/unix/sysv/linux/waitpid.c
> index f0897574c0..7d4e0bb77d 100644
> --- a/sysdeps/unix/sysv/linux/waitpid.c
> +++ b/sysdeps/unix/sysv/linux/waitpid.c
> @@ -20,12 +20,58 @@
>  #include <sysdep-cancel.h>
>  #include <stdlib.h>
>  #include <sys/wait.h>
> +#include <unistd.h>
>  
>  __pid_t
>  __waitpid (__pid_t pid, int *stat_loc, int options)
>  {
>  #ifdef __NR_waitpid
>    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
> +#elif defined(__NR_waitid)
> +  __pid_t ret;
> +  idtype_t idtype = P_PID;
> +  siginfo_t infop;
> +
> +  if (pid < -1) {
> +    idtype = P_PGID;
> +    pid *= -1;
> +  } else if (pid == -1) {
> +    idtype = P_ALL;
> +  } else if (pid == 0) {
> +    idtype = P_PGID;
> +    pid = getpgrp();
> +  }
> +
> +  options |= WEXITED;
> +
> +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);

This emulation has a fundamental race condition. Between getpgrp and
waitid, a signal handler may perform setpgrp, setsid, and/or fork in
ways that cause the wrong pgid to be passed to the waitid syscall.
There is no way around this because you cannot block signals for the
interval, since signals must be able to interrupt the waitid syscall.
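
A minimal illustration of the window (an annotated sketch of the
quoted emulation, with a hypothetical signal handler):

  pid = getpgrp ();   /* caller's pgid is A at this point */
  /* A signal handler runs here and calls setpgid (0, 0), moving the
     caller to a new pgid B, possibly with children already in B.  */
  SYSCALL_CANCEL (waitid, P_PGID, pid, &infop, options, NULL);
  /* Waits on the stale pgid A; children in B are never matched.  */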

Unless there's some trick I'm missing here, the kernel folks' removal
of the wait4 syscall is just a bug in the kernel that they need to
fix. It also makes it impossible to implement the wait4 function,
since there's no way to get rusage for the exited process.

Rich
Rich Felker July 21, 2019, 4:20 a.m. UTC | #4
On Sun, Jul 21, 2019 at 12:03:10AM -0400, Rich Felker wrote:
> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
> > If the waitid syscall is available, let's use that, as waitpid
> > and wait4 aren't always available (they aren't available on RV32).
> > 
> > Unfortunately waitid is substantially different from waitpid and wait4, so
> > the conversion ends up being complex.
> > 
> > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> > ---
> >  ChangeLog                                  |  3 ++
> >  sysdeps/unix/sysv/linux/wait.c             | 39 ++++++++++++++++--
> >  sysdeps/unix/sysv/linux/waitpid.c          | 46 ++++++++++++++++++++++
> >  sysdeps/unix/sysv/linux/waitpid_nocancel.c | 45 +++++++++++++++++++++
> >  4 files changed, 130 insertions(+), 3 deletions(-)
> > [...]
> >  
> >  weak_alias (__libc_wait, __wait)
> > diff --git a/sysdeps/unix/sysv/linux/waitpid.c b/sysdeps/unix/sysv/linux/waitpid.c
> > index f0897574c0..7d4e0bb77d 100644
> > --- a/sysdeps/unix/sysv/linux/waitpid.c
> > +++ b/sysdeps/unix/sysv/linux/waitpid.c
> > @@ -20,12 +20,58 @@
> >  #include <sysdep-cancel.h>
> >  #include <stdlib.h>
> >  #include <sys/wait.h>
> > +#include <unistd.h>
> >  
> >  __pid_t
> >  __waitpid (__pid_t pid, int *stat_loc, int options)
> >  {
> >  #ifdef __NR_waitpid
> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
> > +#elif defined(__NR_waitid)
> > +  __pid_t ret;
> > +  idtype_t idtype = P_PID;
> > +  siginfo_t infop;
> > +
> > +  if (pid < -1) {
> > +    idtype = P_PGID;
> > +    pid *= -1;
> > +  } else if (pid == -1) {
> > +    idtype = P_ALL;
> > +  } else if (pid == 0) {
> > +    idtype = P_PGID;
> > +    pid = getpgrp();
> > +  }
> > +
> > +  options |= WEXITED;
> > +
> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
> 
> This emulation has a fundamental race condition. Between getpgrp and
> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
> ways that cause the wrong pgid to be passed to the waitid syscall.
> There is no way around this because you cannot block signals for the
> interval, since signals must be able to interrupt the waitid syscall.
> 
> Unless there's some trick I'm missing here, the kernel folks' removal
> of the wait4 syscall is just a bug in the kernel that they need to
> fix. It also makes it impossible to implement the wait4 function,
> since there's no way to get rusage for the exited process.

Reportedly (via Stefan O'Rear just now) there was a similar kernel bug
introduced in 161550d74c07303ffa6187ba776f62df5a906a21 that makes
wait4 fail to honor pgrp changes that happen while already in the
syscall (e.g. performed on the caller by another thread or even
another process). But the race condition here in userspace is even
more egregious I think, since it violates the contract in a case where
there is a clear observable order between the pgrp change and the
blocking wait -- for instance, the signal handler could change pgrp
of itself and a child process, and then whether or not the signal
handler had executed before the waitpid, the waitpid should catch the
child's exit. But with the above race, it fails to.

Rich
Arnd Bergmann July 21, 2019, 7:57 a.m. UTC | #5
On Sun, Jul 21, 2019 at 6:03 AM Rich Felker <dalias@libc.org> wrote:
> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
> >  #ifdef __NR_waitpid
> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
> > +#elif defined(__NR_waitid)
> > +  __pid_t ret;
> > +  idtype_t idtype = P_PID;
> > +  siginfo_t infop;
> > +
> > +  if (pid < -1) {
> > +    idtype = P_PGID;
> > +    pid *= -1;
> > +  } else if (pid == -1) {
> > +    idtype = P_ALL;
> > +  } else if (pid == 0) {
> > +    idtype = P_PGID;
> > +    pid = getpgrp();
> > +  }
> > +
> > +  options |= WEXITED;
> > +
> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
>
> This emulation has a fundamental race condition. Between getpgrp and
> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
> ways that cause the wrong pgid to be passed to the waitid syscall.
> There is no way around this because you cannot block signals for the
> interval, since signals must be able to interrupt the waitid syscall.

Interesting, I don't think anyone ever considered this race, as waitid()
was clearly always /intended/ as a superset of wait(), waitpid(), wait3()
and wait4(), as Roland McGrath described in the initial creation
of the waitid() syscall, see
https://lore.kernel.org/lkml/200408152303.i7FN3Vc3021030@magilla.sf.frob.com/

I originally planned to have a new waitid_time64() syscall to replace
them in the y2038 series, but that got deferred, so at the moment
the older 32-bit architectures don't have any changes for wait*(), and
riscv32 just removed wait4() but kept waitid().

It would be easy enough to restore wait4() for riscv32, or we could
decide to extend waitid() in a backward-compatible way to close
this race, e.g. by allowing P_PGID with pid=0 (this is currently
-EINVAL),  by adding a new P_SAME_PGID macro to cover that
case, or by creating a new waitid replacement that does other things
as well (nanosecond ru_*time, pidfd, ...).

Adding a few more kernel developers that may have an opinion on this.

> Unless there's some trick I'm missing here, the kernel folks' removal
> of the wait4 syscall is just a bug in the kernel that they need to
> fix. It also makes it impossible to implement the wait4 function,
> since there's no way to get rusage for the exited process.

The rusage is passed back from the kernel waitid() as an extension
to the posix interface, in order to allow implementing wait4().

      Arnd
Eric W. Biederman July 21, 2019, 11:59 a.m. UTC | #6
Rich Felker <dalias@libc.org> writes:

> On Sun, Jul 21, 2019 at 12:03:10AM -0400, Rich Felker wrote:
>> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
>> > If the waitid syscall is available, let's use that, as waitpid
>> > and wait4 aren't always available (they aren't available on RV32).
>> > 
>> > Unfortunately waitid is substantially different from waitpid and wait4, so
>> > the conversion ends up being complex.
>> > 
>> > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
>> > ---
>> >  ChangeLog                                  |  3 ++
>> >  sysdeps/unix/sysv/linux/wait.c             | 39 ++++++++++++++++--
>> >  sysdeps/unix/sysv/linux/waitpid.c          | 46 ++++++++++++++++++++++
>> >  sysdeps/unix/sysv/linux/waitpid_nocancel.c | 45 +++++++++++++++++++++
>> >  4 files changed, 130 insertions(+), 3 deletions(-)
>> > [...]
>> >  
>> >  weak_alias (__libc_wait, __wait)
>> > diff --git a/sysdeps/unix/sysv/linux/waitpid.c b/sysdeps/unix/sysv/linux/waitpid.c
>> > index f0897574c0..7d4e0bb77d 100644
>> > --- a/sysdeps/unix/sysv/linux/waitpid.c
>> > +++ b/sysdeps/unix/sysv/linux/waitpid.c
>> > @@ -20,12 +20,58 @@
>> >  #include <sysdep-cancel.h>
>> >  #include <stdlib.h>
>> >  #include <sys/wait.h>
>> > +#include <unistd.h>
>> >  
>> >  __pid_t
>> >  __waitpid (__pid_t pid, int *stat_loc, int options)
>> >  {
>> >  #ifdef __NR_waitpid
>> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
>> > +#elif defined(__NR_waitid)
>> > +  __pid_t ret;
>> > +  idtype_t idtype = P_PID;
>> > +  siginfo_t infop;
>> > +
>> > +  if (pid < -1) {
>> > +    idtype = P_PGID;
>> > +    pid *= -1;
>> > +  } else if (pid == -1) {
>> > +    idtype = P_ALL;
>> > +  } else if (pid == 0) {
>> > +    idtype = P_PGID;
>> > +    pid = getpgrp();
>> > +  }
>> > +
>> > +  options |= WEXITED;
>> > +
>> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
>> 
>> This emulation has a fundamental race condition. Between getpgrp and
>> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
>> ways that cause the wrong pgid to be passed to the waitid syscall.
>> There is no way around this because you cannot block signals for the
>> interval, since signals must be able to interrupt the waitid syscall.
>> 
>> Unless there's some trick I'm missing here, the kernel folks' removal
>> of the wait4 syscall is just a bug in the kernel that they need to
>> fix. It also makes it impossible to implement the wait4 function,
>> since there's no way to get rusage for the exited process.
>
> Reportedly (via Stefan O'Rear just now) there was a similar kernel bug
> introduced in 161550d74c07303ffa6187ba776f62df5a906a21 that makes
> wait4 fail to honor pgrp changes that happen while already in the
> syscall (e.g. performed on the caller by another thread or even
> another process).

I could not find the report from Stefan O'Rear.

Does that result in actual problems for programs or is this a
theoretical race noticed upon code review?

> But the race condition here in userspace is even
> more egregious I think, since it violates the contract in a case where
> there is a clear observable order between the pgrp change and the
> blocking wait -- for instance, the signal handler could change pgrp
> of itself and a child process, and then whether or not the signal
> handler had executed before the waitpid, the waitpid should catch the
> child's exit. But with the above race, it fails to.

Definitely a bigger issue.  I believe POSIX requires waitpid
to be async-signal-safe.

Eric
Eric W. Biederman July 21, 2019, 12:15 p.m. UTC | #7
Arnd Bergmann <arnd@arndb.de> writes:

> On Sun, Jul 21, 2019 at 6:03 AM Rich Felker <dalias@libc.org> wrote:
>> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
>> >  #ifdef __NR_waitpid
>> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
>> > +#elif defined(__NR_waitid)
>> > +  __pid_t ret;
>> > +  idtype_t idtype = P_PID;
>> > +  siginfo_t infop;
>> > +
>> > +  if (pid < -1) {
>> > +    idtype = P_PGID;
>> > +    pid *= -1;
>> > +  } else if (pid == -1) {
>> > +    idtype = P_ALL;
>> > +  } else if (pid == 0) {
>> > +    idtype = P_PGID;
>> > +    pid = getpgrp();
>> > +  }
>> > +
>> > +  options |= WEXITED;
>> > +
>> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
>>
>> This emulation has a fundamental race condition. Between getpgrp and
>> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
>> ways that cause the wrong pgid to be passed to the waitid syscall.
>> There is no way around this because you cannot block signals for the
>> interval, since signals must be able to interrupt the waitid syscall.
>
> Interesting, I don't think anyone ever considered this race, as waitid()
> was clearly always /intended/ as a superset of wait(), waitpid(), wait3()
> and wait4(), as Roland McGrath described in the initial creation
> of the waitid() syscall, see
> https://lore.kernel.org/lkml/200408152303.i7FN3Vc3021030@magilla.sf.frob.com/
>

It is definitely a problem, as setpgid and setpgrp and setsid are all
required to be async-signal-safe, and this waitpid emulation is clearly not.

> I originally planned to have a new waitid_time64() syscall to replace
> them in the y2038 series, but that got deferred, so at the moment
> the older 32-bit architectures don't have any changes for wait*(), and
> riscv32 just removed wait4() but kept waitid().
>
> It would be easy enough to restore wait4() for riscv32, or we could
> decide to extend waitid() in a backward-compatible way to close
> this race, e.g. by allowing P_PGID with pid=0 (this is currently
> -EINVAL),  by adding a new P_SAME_PGID macro to cover that
> case, or by creating a new waitid replacement that does other things
> as well (nanosecond ru_*time, pidfd, ...).
>
> Adding a few more kernel developers that may have an opinion on this.

In a different reply it was mentioned that wait4 has had an issue since
2008, where it does not track changes in the current process's pgrp.

I would like to get resolution on whether that is a real problem for anyone
or something theoretical found on code review, before we deal
with this issue, because tracking pgrp changes will require a deeper
code change.

My gut feel says either fix waitid (with P_SAME_PGID) or clearly document
why the kernel's waitid is not a replacement for wait4, and add wait4 to
riscv.

Would adding wait4 to riscv allow us to get rid of the rusage parameter
of the kernel's waitid?

I don't oppose a new syscall to take advantage of pidfd but I don't
think there is any need to tie the two fixes together so I would rather
keep them separate.  Just so we don't rush through fixing one to deal
with the other.

Eric
Christian Brauner July 21, 2019, 12:28 p.m. UTC | #8
On July 21, 2019 2:15:24 PM GMT+02:00, ebiederm@xmission.com wrote:
>Arnd Bergmann <arnd@arndb.de> writes:
>
>> On Sun, Jul 21, 2019 at 6:03 AM Rich Felker <dalias@libc.org> wrote:
>>> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
>>> >  #ifdef __NR_waitpid
>>> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
>>> > +#elif defined(__NR_waitid)
>>> > +  __pid_t ret;
>>> > +  idtype_t idtype = P_PID;
>>> > +  siginfo_t infop;
>>> > +
>>> > +  if (pid < -1) {
>>> > +    idtype = P_PGID;
>>> > +    pid *= -1;
>>> > +  } else if (pid == -1) {
>>> > +    idtype = P_ALL;
>>> > +  } else if (pid == 0) {
>>> > +    idtype = P_PGID;
>>> > +    pid = getpgrp();
>>> > +  }
>>> > +
>>> > +  options |= WEXITED;
>>> > +
>>> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options,
>NULL);
>>>
>>> This emulation has a fundamental race condition. Between getpgrp and
>>> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
>>> ways that cause the wrong pgid to be passed to the waitid syscall.
>>> There is no way around this because you cannot block signals for the
>>> interval, since signals must be able to interrupt the waitid
>syscall.
>>
>> Interesting, I don't think anyone ever considered this race, as
>waitid()
>> was clearly always /intended/ as a superset of wait(), waitpid(),
>wait3()
>> and wait4(), as Roland McGrath described in the initial creation
>> of the waitid() syscall, see
>>
>https://lore.kernel.org/lkml/200408152303.i7FN3Vc3021030@magilla.sf.frob.com/
>>
>
>It is definitely a problem, as setpgid and setpgrp and setsid are all
>required to be async-signal-safe, and this waitpid emulation is clearly not.
>
>> I originally planned to have a new waitid_time64() syscall to replace
>> them in the y2038 series, but that got deferred, so at the moment
>> the older 32-bit architectures don't have any changes for wait*(),
>and
>> riscv32 just removed wait4() but kept waitid().
>>
>> It would be easy enough to restore wait4() for riscv32, or we could
>> decide to extend waitid() in a backward-compatible way to close
>> this race, e.g. by allowing P_PGID with pid=0 (this is currently
>> -EINVAL),  by adding a new P_SAME_PGID macro to cover that
>> case, or by creating a new waitid replacement that does other things
>> as well (nanosecond ru_*time, pidfd, ...).
>>
>> Adding a few more kernel developers that may have an opinion on this.
>
>In a different reply it was mentioned that wait4 has had an issue since
>2008, where it does not track changes in the current process's pgrp.
>
>I would like to get resolution on whether that is a real problem for
>anyone or something theoretical found on code review, before we deal
>with this issue, because tracking pgrp changes will require a deeper
>code change.
>
>My gut feel says either fix waitid (with P_SAME_PGID) or clearly
>document
>why the kernel's waitid is not a replacement for wait4, and add wait4
>to
>riscv.
>
>Would adding wait4 to riscv allow us to get rid of the rusage parameter
>of the kernel's waitid?
>
>I don't oppose a new syscall to take advantage of pidfd but I don't
>think there is any need to tie the two fixes together so I would rather
>keep them separate.  Just so we don't rush through fixing one to deal
>with the other.
>
>Eric

Fwiw, I have a patchset for pidfd_wait ready.
But I strongly oppose mixing pids and pidfds.
That syscall will only handle pidfds.

Christian
Arnd Bergmann July 21, 2019, 2:30 p.m. UTC | #9
On Sun, Jul 21, 2019 at 2:15 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> Arnd Bergmann <arnd@arndb.de> writes:
> > On Sun, Jul 21, 2019 at 6:03 AM Rich Felker <dalias@libc.org> wrote:
> >> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
> >> >  #ifdef __NR_waitpid
> >> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
> >> > +#elif defined(__NR_waitid)
> >> > +  __pid_t ret;
> >> > +  idtype_t idtype = P_PID;
> >> > +  siginfo_t infop;
> >> > +
> >> > +  if (pid < -1) {
> >> > +    idtype = P_PGID;
> >> > +    pid *= -1;
> >> > +  } else if (pid == -1) {
> >> > +    idtype = P_ALL;
> >> > +  } else if (pid == 0) {
> >> > +    idtype = P_PGID;
> >> > +    pid = getpgrp();
> >> > +  }
> >> > +
> >> > +  options |= WEXITED;
> >> > +
> >> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
> >>
> >> This emulation has a fundamental race condition. Between getpgrp and
> >> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
> >> ways that cause the wrong pgid to be passed to the waitid syscall.
> >> There is no way around this because you cannot block signals for the
> >> interval, since signals must be able to interrupt the waitid syscall.
> >
> > Interesting, I don't think anyone ever considered this race, as waitid()
> > was clearly always /intended/ as a superset of wait(), waitpid(), wait3()
> > and wait4(), as Roland McGrath described in the initial creation
> > of the waitid() syscall, see
> > https://lore.kernel.org/lkml/200408152303.i7FN3Vc3021030@magilla.sf.frob.com/
> >
>
> It is definitely a problem, as setpgid and setpgrp and setsid are all
> required to be async-signal-safe, and this waitpid emulation is clearly not.

Ok.

> Would adding wait4 to riscv allow us to get rid of the rusage parameter
> of the kernel's waitid?

No, the kernel waitid() behavior is widely documented, and there are
likely to be applications that rely on it across architectures.

The only downside I can think of for adding wait4() is that it also
uses 32-bit time_t in its rusage, so if we create a time64 version
of rusage, we now have to add three more system calls (waitid,
getrusage, and wait4) instead of just two.

> I don't oppose a new syscall to take advantage of pidfd but I don't
> think there is any need to tie the two fixes together so I would rather
> keep them separate.  Just so we don't rush through fixing one to deal
> with the other.

I see multiple independent issues with the current waitid():

- The race that Rich pointed out
- The lack of pidfd support
- The usage of a 32-bit timeval structure that is incompatible with
  the libc definition on rv32 and other 32-bit architectures with
  a y2038-safe libc (there is no y2038 overflow in rusage itself, but
  it requires an ugly wrapper to convert the structure)
- The lack of nanosecond resolution in rusage that someone asked for
- When we last talked about it, there was some debate over replacing
  siginfo_t with a different structure.

We don't have to address each one of those in a single new syscall
(or even at all), but addressing them one at a time would risk adding
even more system calls to do essentially just one thing that already
has at least five versions at the libc level (wait, waitpid, wait3, wait4
and waitid).

      Arnd
Eric W. Biederman July 21, 2019, 3:45 p.m. UTC | #10
Arnd Bergmann <arnd@arndb.de> writes:

> On Sun, Jul 21, 2019 at 2:15 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
>> Arnd Bergmann <arnd@arndb.de> writes:
>> > On Sun, Jul 21, 2019 at 6:03 AM Rich Felker <dalias@libc.org> wrote:
>> >> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
>> >> >  #ifdef __NR_waitpid
>> >> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
>> >> > +#elif defined(__NR_waitid)
>> >> > +  __pid_t ret;
>> >> > +  idtype_t idtype = P_PID;
>> >> > +  siginfo_t infop;
>> >> > +
>> >> > +  if (pid < -1) {
>> >> > +    idtype = P_PGID;
>> >> > +    pid *= -1;
>> >> > +  } else if (pid == -1) {
>> >> > +    idtype = P_ALL;
>> >> > +  } else if (pid == 0) {
>> >> > +    idtype = P_PGID;
>> >> > +    pid = getpgrp();
>> >> > +  }
>> >> > +
>> >> > +  options |= WEXITED;
>> >> > +
>> >> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
>> >>
>> >> This emulation has a fundamental race condition. Between getpgrp and
>> >> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
>> >> ways that cause the wrong pgid to be passed to the waitid syscall.
>> >> There is no way around this because you cannot block signals for the
>> >> interval, since signals must be able to interrupt the waitid syscall.
>> >
>> > Interesting, I don't think anyone ever considered this race, as waitid()
>> > was clearly always /intended/ as a superset of wait(), waitpid(), wait3()
>> > and wait4(), as Roland McGrath described in the initial creation
>> > of the waitid() syscall, see
>> > https://lore.kernel.org/lkml/200408152303.i7FN3Vc3021030@magilla.sf.frob.com/
>> >
>>
>> It is definitely a problem, as setpgid and setpgrp and setsid are all
>> required to be async-signal-safe, and this waitpid emulation is clearly not.
>
> Ok.
>
>> Would adding wait4 to riscv allow us to get rid of the rusage parameter
>> of the kernel's waitid?
>
> No, the kernel waitid() behavior is widely documented, and there are
> likely to be applications that rely on it across architectures.
>
> The only downside I can think of for adding wait4() is that it also
> uses 32-bit time_t in its rusage, so if we create a time64 version
> of rusage, we now have to add three more system calls (waitid,
> getrusage, and wait4) instead of just two.
>
>> I don't oppose a new syscall to take advantage of pidfd but I don't
>> think there is any need to tie the two fixes together so I would rather
>> keep them separate.  Just so we don't rush through fixing one to deal
>> with the other.
>
> I see multiple independent issues with the current waitid():
>
> - The race that Rich pointed out
> - The lack of pidfd support
> - The usage of a 32-bit timeval structure that is incompatible with
>   the libc definition on rv32 and other 32-bit architectures with
>   a y2038-safe libc (there is no y2038 overflow in rusage itself, but
>   it requires an ugly wrapper to convert the structure)
> - The lack of nanosecond resolution in rusage that someone asked for
> - When we last talked about it, there was some debate over replacing
>   siginfo_t with a different structure.
>
> We don't have to address each one of those in a single new syscall
> (or even at all), but addressing them one at a time would risk adding
> even more system calls to do essentially just one thing that already
> has at least five versions at the libc level (wait, waitpid, wait3, wait4
> and waitid).

For the particular issue of 32bit riscv needing a working wait4 I see
one of two possibilities.

We either add P_PROCESS_PGID to the kernel's waitid or we add wait4.

I am leaning towards P_PROCESS_PGID as this is the tiny little bit
needed to make the kernel's waitid the one call that combines them all,
that it already tries to be.  Plus P_PROCESS_PGID or the equivalent in
the kernel internal version is needed if we choose to have wait4 handle
the process group id changing while wait4 is running.

Eric
Arnd Bergmann July 21, 2019, 5:05 p.m. UTC | #11
On Sun, Jul 21, 2019 at 5:45 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> Arnd Bergmann <arnd@arndb.de> writes:
> > On Sun, Jul 21, 2019 at 2:15 PM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >> Arnd Bergmann <arnd@arndb.de> writes:
> >> > On Sun, Jul 21, 2019 at 6:03 AM Rich Felker <dalias@libc.org> wrote:
> >> >> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
> > - The race that Rich pointed out
> > - The lack of pidfd support
> > - The usage of a 32-bit timeval structure that is incompatible with
> >   the libc definition on rv32 and other 32-bit architectures with
> >   a y2038-safe libc (there is no y2038 overflow in rusage itself, but
> >   it requires an ugly wrapper to convert the structure)
> > - The lack of nanosecond resolution in rusage that someone asked for
> > - When we last talked about it, there was some debate over replacing
> >   siginfo_t with a different structure.
> >
> > We don't have to address each one of those in a single new syscall
> > (or even at all), but addressing them one at a time would risk adding
> > even more system calls to do essentially just one thing that already
> > has at least five versions at the libc level (wait, waitpid, wait3, wait4
> > and waitid).
>
> For the particular issue of 32bit riscv needing a working wait4 I see
> one of two possibilities.
>
> We either add P_PROCESS_PGID to the kernel's waitid or we add wait4.
>
> I am leaning towards P_PROCESS_PGID as this is the tiny little bit
> needed to make the kernel's waitid the one call that combines them all,
> that it already tries to be.  Plus P_PROCESS_PGID or the equivalent in
> the kernel internal version is needed if we choose to have wait4 handle
> the process group id changing while wait4 is running.

Yes, that sounds good to me, both as a short-term and as a long-term
solution for the race, on all architectures. It would help if we could
have some extra review on the wait4() and waitpid() implementation
around waitid(). I first implemented this in my prototype for a y2038-safe
musl port, and Alistair wrote a corresponding version for glibc.

I did make sure that it passes LTP (and that found a number of bugs
in my first attempt), but it would still be good to know that the P_PGID
issue is the only problem in the underlying system call, aside from
any remaining bugs in the wrapper implementation.

        Arnd
Linus Torvalds July 21, 2019, 5:16 p.m. UTC | #12
On Sun, Jul 21, 2019 at 8:45 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
> We either add P_PROCESS_PGID to the kernel's waitid or we add wait4.

Do we actually need a new flag?

Why not just allow people to pass in pid = 0 with the existing P_PGID flag?

Those are the traditional and documented waitpid() semantics. Why
wouldn't we use those semantics for waitid() too?

And since our current (broken) waitid() returns EINVAL for that case,
it's even trivial to test for in user space (and fall back on the
broken racy code in user space - which you have to do anyway).
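
Roughly, a hedged sketch of such a probe (the helper name and caching
policy are illustrative only):

  #include <errno.h>
  #include <sys/syscall.h>
  #include <sys/wait.h>
  #include <unistd.h>

  /* Try the extended semantics (P_PGID with id 0 = caller's process
     group); fall back to the racy getpgrp () emulation on kernels
     that reject it with EINVAL.  */
  static long
  waitid_own_pgrp (siginfo_t *infop, int options)
  {
    long r = syscall (SYS_waitid, P_PGID, 0, infop, options | WEXITED, NULL);
    if (r < 0 && errno == EINVAL)
      r = syscall (SYS_waitid, P_PGID, getpgrp (), infop,
                   options | WEXITED, NULL);
    return r;
  }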

Honestly, that seems like the simplest solution, but also like the
only sane model. The fact that P_PGID with a zero pid doesn't work
seems to simply be a bug right now, and keeps waitid() from being a
proper superset of waitpid().

              Linus
Eric W. Biederman July 21, 2019, 9:40 p.m. UTC | #13
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Sun, Jul 21, 2019 at 8:45 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>>
>> We either add P_PROCESS_PGID to the kernel's waitid or we add wait4.
>
> Do we actually need a new flag?
>
> Why not just allow people to pass in pid = 0 with the existing P_PGID flag?
>
> Those are the traditional and documented waitpid() semantics. Why
> wouldn't we use those semantics for waitid() too?
>
> And since our current (broken) waitid() returns EINVAL for that case,
> it's even trivial to test for in user space (and fall back on the
> broken racy code in user space - which you have to do anyway).

Our current waitid implementation when serving as waitid is not broken.

What is broken is the waitpid emulation on top of waitid.

The way waitid is defined, P_PGID with a pid of 0 means
wait for the process group id of 0, which in theory and in limited
practice exists: that is the pid and process group of the idle
thread.

Further, there is the outstanding question: should P_PROCESS_PGID refer
to the current process's process group id at the time of the waitid call,
or should P_PROCESS_PGID refer to the process group id when a child is
found?

It has been suggested elsewhere in this conversation that 161550d74c07
("pid: sys_wait... fixes") may have introduced a regression.  If so,
the current wait4 behavior of capturing the process group at the time
of call is wrong and needs to be fixed.

Not capturing the pid at the time of call is a very different behavior
of P_PROCESS_PGID vs. all of the other waitid cases, which do have the pid
fixed at the time of the call, which further argues for a separate flag,
because the behavior would be distinctly different.

> Honestly, that seems like the simplest soilution, but also like the
> only sane model. The fact that P_PGID with a zero pid doesn't work
> seems to simply be a bug right now, and keeps waitid() from being a
> proper superset of waitpid().

In the context of waitid it does not look sane.  Unlike the other wait
functions, waitid does not have any magic ids, and having an idtype
field makes that kind of magic unnecessary.  Further, it is trivial
to add one line to the architecture-independent enumeration and have zero
risk of confusion or of regressing user space.

Eric
Rich Felker July 21, 2019, 10:59 p.m. UTC | #14
On Sun, Jul 21, 2019 at 06:59:09AM -0500, Eric W. Biederman wrote:
> Rich Felker <dalias@libc.org> writes:
> 
> > On Sun, Jul 21, 2019 at 12:03:10AM -0400, Rich Felker wrote:
> >> On Tue, Jul 16, 2019 at 05:08:48PM -0700, Alistair Francis wrote:
> >> > If the waitid syscall is available, let's use that, as waitpid
> >> > and wait4 aren't always available (they aren't available on RV32).
> >> > 
> >> > Unfortunately waitid is substantially different from waitpid and wait4, so
> >> > the conversion ends up being complex.
> >> > 
> >> > Signed-off-by: Alistair Francis <alistair.francis@wdc.com>
> >> > ---
> >> >  ChangeLog                                  |  3 ++
> >> >  sysdeps/unix/sysv/linux/wait.c             | 39 ++++++++++++++++--
> >> >  sysdeps/unix/sysv/linux/waitpid.c          | 46 ++++++++++++++++++++++
> >> >  sysdeps/unix/sysv/linux/waitpid_nocancel.c | 45 +++++++++++++++++++++
> >> >  4 files changed, 130 insertions(+), 3 deletions(-)
> >> > [...]
> >> >  
> >> >  weak_alias (__libc_wait, __wait)
> >> > diff --git a/sysdeps/unix/sysv/linux/waitpid.c b/sysdeps/unix/sysv/linux/waitpid.c
> >> > index f0897574c0..7d4e0bb77d 100644
> >> > --- a/sysdeps/unix/sysv/linux/waitpid.c
> >> > +++ b/sysdeps/unix/sysv/linux/waitpid.c
> >> > @@ -20,12 +20,58 @@
> >> >  #include <sysdep-cancel.h>
> >> >  #include <stdlib.h>
> >> >  #include <sys/wait.h>
> >> > +#include <unistd.h>
> >> >  
> >> >  __pid_t
> >> >  __waitpid (__pid_t pid, int *stat_loc, int options)
> >> >  {
> >> >  #ifdef __NR_waitpid
> >> >    return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
> >> > +#elif defined(__NR_waitid)
> >> > +  __pid_t ret;
> >> > +  idtype_t idtype = P_PID;
> >> > +  siginfo_t infop;
> >> > +
> >> > +  if (pid < -1) {
> >> > +    idtype = P_PGID;
> >> > +    pid *= -1;
> >> > +  } else if (pid == -1) {
> >> > +    idtype = P_ALL;
> >> > +  } else if (pid == 0) {
> >> > +    idtype = P_PGID;
> >> > +    pid = getpgrp();
> >> > +  }
> >> > +
> >> > +  options |= WEXITED;
> >> > +
> >> > +  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
> >> 
> >> This emulation has a fundamental race condition. Between getpgrp and
> >> waitid, a signal handler may perform setpgrp, setsid, and/or fork in
> >> ways that cause the wrong pgid to be passed to the waitid syscall.
> >> There is no way around this because you cannot block signals for the
> >> interval, since signals must be able to interrupt the waitid syscall.
> >> 
> >> Unless there's some trick I'm missing here, the kernel folks' removal
> >> of the wait4 syscall is just a bug in the kernel that they need to
> >> fix. It also makes it impossible to implement the wait4 function,
> >> since there's no way to get rusage for the exited process.
> >
> > Reportedly (via Stefan O'Rear just now) there was a similar kernel bug
> > introduced in 161550d74c07303ffa6187ba776f62df5a906a21 that makes
> > wait4 fail to honor pgrp changes that happen while already in the
> > syscall (e.g. performed on the caller by another thread or even
> > another process).
> 
> I could not find the report from Stefan O'Rear.

There's no report yet as far as I know; it came up on #musl when I
mentioned the issue there. I'll inquire about getting it properly
reported. I just wanted to give credit to the original source when
conveying it here.

> Does that result in actual problems for programs or is this a
> theoretical race noticed upon code review?

I noticed it while looking at the glibc rv32 port proposal, where the
emulation caught my eye as something ugly and undesirable we'd have to
do for musl too. As soon as I looked at it in more detail, the race
condition was apparent, and this is exactly the kind of subtle bug we
don't like to reproduce in musl. I'm not aware of any particular
application that would be broken, but it's a clear violation of the
AS-safety requirements of the interfaces involved.

Regarding resolving it, I would be perfectly happy with enabling wait4
even for archs that don't want time32 syscalls. musl will be
translating the wait4 rusage results to 64-bit time_t anyway, so
there's utterly no advantage to having to do it in a gratuitously
different way on rv32. The only problem with the fields in the kernel
struct rusage being 32-bit is that you can't represent usage of more
than 68 years of cpu time by a process, which is rather
inconsequential -- but if a working extension to the waitid syscall is
provided, we can use that too.
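
The translation itself is mechanical; a hedged sketch, with
illustrative type and helper names rather than musl's actual
internals:

  #include <stdint.h>
  #include <sys/time.h>

  struct tv32 { int32_t tv_sec; int32_t tv_usec; };

  static void
  tv_from_tv32 (struct timeval *out, const struct tv32 *in)
  {
    out->tv_sec = in->tv_sec;    /* widen seconds to 64-bit time_t */
    out->tv_usec = in->tv_usec;
  }

This gets applied to ru_utime and ru_stime after the syscall returns;
the remaining rusage fields are plain longs and copy through unchanged.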

Rich
Rich Felker July 21, 2019, 11:23 p.m. UTC | #15
On Sun, Jul 21, 2019 at 04:40:27PM -0500, Eric W. Biederman wrote:
> Linus Torvalds <torvalds@linux-foundation.org> writes:
> 
> > On Sun, Jul 21, 2019 at 8:45 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> >>
> >> We either add P_PROCESS_PGID to the kernel's waitid or we add wait4.
> >
> > Do we actually need a new flag?
> >
> > Why not just allow people to pass in pid = 0 with the existing P_PGID flag?
> >
> > Those are the traditional and documented waitpid() semantics. Why
> > wouldn't we use those semantics for waitid() too?
> >
> > And since our current (broken) waitid() returns EINVAL for that case,
> > it's even trivial to test for in user space (and fall back on the
> > broken racy code in user space - which you have to do anyway).
> 
> Our current waitid implementation when serving as waitid is not broken.
> 
> What is broken is the waitpid emulation on top of waitid.
> 
> The way waitid is defined waitid P_PGID with a pid of 0 means
> wait for the process group id of 0.  Which in theory and in limited
> practice exists.  AKA that is the pid and process group of the idle
> thread.
> 
> Further there is the outstanding question.  Should P_PROCESS_PGID refer
> to the curent processes process group id at the time of the waitid call,
> or should P_PROCESS_PGID refer to the process group id when a child is
> found?
> 
> It has been suggested elswehere in this conversation that 161550d74c07
> ("pid: sys_wait... fixes") may have introduced a regression.  If so
> the current wait4 behavior of capturing the process group at the time
> of call is wrong and needs to be fixed.
> 
> Not capuring the pid at time time of call is a very different behavior
> of P_PROCESS_PGID vs all of the other waitid cases which do have the pid
> fixed at the time of the call which further argues a separate flag
> because the behavior would be distinctly different.

I believe that is a regression, and that the "capturing it" line of
thinking is misleading. It's thinking in terms of implementation
rather than interface contract. The interface contract covers which
children it can reap and when it can block. If the call remains
blocking, there must be a possible ordering of events, with no
observable effects contradicting that order, by which there are no
reapable children in the process's current pgid. If the call returns
successfully, there must be a possible ordering of events, with no
observable effects contradicting that order, by which the caller's
pgid and the reaped process's pgid were equal.

>> > Honestly, that seems like the simplest solution, but also like the
> > only sane model. The fact that P_PGID with a zero pid doesn't work
> > seems to simply be a bug right now, and keeps waitid() from being a
> > proper superset of waitpid().
> 
> In the context of waitid it does not look sane.  Unlike the other wait
> functions waitid does not have any magic ids and by having an idtype
> field makes that kind of magic unnecessary.  Further it is trivial
> to add one line to the architecture independent enumeration and have 0
> risk of confusion or of regressing user space.

I'm in agreement that if an extension to the waitid syscall is added,
it should be P_PROCESS_PGID, not defining some special case for
pid==0. It's not entirely clear but arguable that the standard
requires EINVAL for P_PGID + pid==0, and plausible that there are
applications that depend on this. We could emulate the EINVAL case in
userspace, but assigning weird semantics to special cases is just a
mess of potential future headaches when it would be trivial to do it
right. And doing it right would also make programming the userspace side
easier.

Rich
Florian Weimer July 22, 2019, 3:58 p.m. UTC | #16
* Alistair Francis:

>> For wait/wait3/waitpid, you could probably use the implementations for
>> sysdeps/unix/bsd, layered on top of wait4.
>
> I'm not sure what you mean here.

Once you have wait4 (modulo the discussion about the process group), you
can use

#include <sysdeps/unix/bsd/waitpid.c>

to use the BSD layered implementation.  Also see
sysdeps/unix/sysv/linux/wait3.c.
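
The layered version amounts to something like this sketch (simplified;
the real file uses glibc-internal declarations):

  #include <sys/types.h>
  #include <sys/wait.h>

  /* waitpid expressed on top of wait4, discarding the rusage.  */
  pid_t
  __waitpid (pid_t pid, int *stat_loc, int options)
  {
    return __wait4 (pid, stat_loc, options, NULL);
  }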

Thanks,
Florian
Alistair Francis July 22, 2019, 9:02 p.m. UTC | #17
On Mon, Jul 22, 2019 at 8:58 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Alistair Francis:
>
> >> For wait/wait3/waitpid, you could probably use the implementations for
> >> sysdeps/unix/bsd, layered on top of wait4.
> >
> > I'm not sure what you mean here.
>
> Once you have wait4 (modulo the discussion about the process group), you
> can use
>
> #include <sysdeps/unix/bsd/waitpid.c>
>
> to use the BSD layered implementation.  Also see
> sysdeps/unix/sysv/linux/wait3.c.

Yep, if we get a wait4 implementation I can do that. In fact, if there
is a wait4 syscall for RV32 I can drop this patch altogether, as
it isn't required in that case.

Alistair

>
> Thanks,
> Florian
Eric W. Biederman July 23, 2019, midnight UTC | #18
Rich Felker <dalias@libc.org> writes:

> On Sun, Jul 21, 2019 at 04:40:27PM -0500, Eric W. Biederman wrote:
>> Linus Torvalds <torvalds@linux-foundation.org> writes:
>> 
>> > On Sun, Jul 21, 2019 at 8:45 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>> >>
>> >> We either add P_PROCESS_PGID to the kernel's waitid or we add wait4.
>> >
>> > Do we actually need a new flag?
>> >
>> > Why not just allow people to pass in pid = 0 with the existing P_PGID flag?
>> >
>> > Those are the traditional and documented waitpid() semantics. Why
>> > wouldn't we use those semantics for waitid() too?
>> >
>> > And since our current (broken) waitid() returns EINVAL for that case,
>> > it's even trivial to test for in user space (and fall back on the
>> > broken racy code in user space - which you have to do anyway).
>> 
>> Our current waitid implementation when serving as waitid is not broken.
>> 
>> What is broken is the waitpid emulation on top of waitid.
>> 
>> The way waitid is defined, P_PGID with a pid of 0 means
>> wait for the process group id of 0, which in theory and in limited
>> practice exists: that is the pid and process group of the idle
>> thread.
>> 
>> Further, there is the outstanding question: should P_PROCESS_PGID refer
>> to the current process's process group id at the time of the waitid call,
>> or should P_PROCESS_PGID refer to the process group id when a child is
>> found?
>> 
>> It has been suggested elsewhere in this conversation that 161550d74c07
>> ("pid: sys_wait... fixes") may have introduced a regression.  If so,
>> the current wait4 behavior of capturing the process group at the time
>> of call is wrong and needs to be fixed.
>> 
>> Not capturing the pid at the time of call is a very different behavior
>> of P_PROCESS_PGID vs. all of the other waitid cases, which do have the pid
>> fixed at the time of the call, which further argues for a separate flag,
>> because the behavior would be distinctly different.
>
> I believe that is a regression, and that the "capturing it" line of
> thinking is misleading. It's thinking in terms of implementation
> rather than interface contract. The interface contract covers which
> children it can reap and when it can block. If the call remains
> blocking, there must be a possible ordering of events, with no
> observable effects contradicting that order, by which there are no
> reapable children in the process's current pgid. If the call returns
> successfully, there must be a possible ordering of events, with no
> observable effects contradicting that order, by which the caller's
> pgid and the reaped process's pgid were equal.

A regression (as we talk about it in the linux kernel) requires there to
be an actual program that has problems because of the change in behavior
of the kernel.  Just it being possible for userspace to observe a
behavior change is not enough for something to be classified as a
regression.

Furthermore, thinking about this and looking at it more closely, this can
only be an issue in the presence of pthreads.  Except in a very narrow
special case, setpgid is restricted to changing only the process group of
the calling process, which means that in unix without pthreads setpgid
cannot run while a process is in wait, waitpid, or wait4.


I was really hoping we would have a pointer to a program that shows
a problem, or a clarification that someone was just reading the code.
I will wait a bit more, but given that the code has worked without bug
reports for 11 years I am going to assume that there are no programs
that have regressed because of my old change.


>> > Honestly, that seems like the simplest solution, but also like the
>> > only sane model. The fact that P_PGID with a zero pid doesn't work
>> > seems to simply be a bug right now, and keeps waitid() from being a
>> > proper superset of waitpid().
>> 
>> In the context of waitid it does not look sane.  Unlike the other wait
>> functions, waitid does not have any magic ids, and having an idtype
>> field makes that kind of magic unnecessary.  Further, it is trivial
>> to add one line to the architecture-independent enumeration and have zero
>> risk of confusion or of regressing user space.
>
> I'm in agreement that if an extension to the waitid syscall is added,
> it should be P_PROCESS_PGID, not defining some special case for
> pid==0. It's not entirely clear but arguable that the standard
> requires EINVAL for P_PGID + pid==0, and plausible that there are
> applications that depend on this. We could emulate the EINVAL case in
> userspace, but assigning weird semantics to special cases is just a
> mess of potential future headaches when it would be trivial to do it
> right. And doing it right would also make programming the userspace side
> easier.

Since this is a POSIX interface shared between many unices, I took
a look at a couple to see what they have done.  If possible, it is
important to be able to write programs that work the same across
different unices.

The BSDs implemented wait6 as their common wait-that-does-everything
system call.  Its large contribution is that wait6 will give both
the rusage of the child that exited and that of its children separately.
To be the one system call that is a superset of them all, wait6 implements
both P_PGID and P_PID with an id of 0 as a request to wait for any
process in the current process group.

The wait6 family also extends the idtype enumeration a bit with
things such as P_UID and P_GID that wait for the effective uid and gid
respectively.

I looked at the code on FreeBSD, and they replace the id of 0 with
the result of getpgid() internally in the system call, before any
waiting happens, resulting in the same semantics as current Linux.


I took a quick look at what Illumos does, and they have narrowed their
kernel wait interface to just the POSIX waitid.  There is no rusage,
nor is there any special case for waiting for the processes in the
current process group.


I looked at XNU and it still implements both wait4 and waitid as native
interfaces.  The waitid is not extended, while their wait4 converts 0 to
-getpgrp() before waiting.



So my recommendation now is to avoid gratuitous incompatibilities.

1) For extending waitid, let's use P_PGID and an id of 0 to represent
   the caller's process group.  That is compatible with the BSDs, and
   portable programs should not have any problem with it.

   I want to stay away from the BSD extension of P_PID with an id of 0
   meaning wait for the calling process's process group.  I see where
   it comes from, but that is confusing.

2) It appears the popular definition of current process group is the
   current process group at the time of the system call.  Currently that
   is Linux, FreeBSD, Illumos, and XNU.  So short of a real program that
   cares, let's adopt that definition in linux.  Along with patches I
   will see about getting the linux manpage updated to clarify that
   point.
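
With 1) in place, the emulation quoted earlier could drop its
getpgrp () call entirely; a hedged sketch of the resulting mapping
(a fragment, assuming a kernel with the extension):

  if (pid < -1)
    {
      idtype = P_PGID;
      pid = -pid;
    }
  else if (pid == -1)
    idtype = P_ALL;
  else if (pid == 0)
    /* With the extension, id 0 means the caller's process group,
       resolved inside the kernel, so no racy getpgrp () call.  */
    idtype = P_PGID;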

Eric
Arnd Bergmann July 23, 2019, 8:12 a.m. UTC | #19
On Tue, Jul 23, 2019 at 2:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> Rich Felker <dalias@libc.org> writes:
> > On Sun, Jul 21, 2019 at 04:40:27PM -0500, Eric W. Biederman wrote:
> >> Linus Torvalds <torvalds@linux-foundation.org> writes:
> > I'm in agreement that if an extension to the waitid syscall is added,
> > it should be P_PROCESS_PGID, not defining some special case for
> > pid==0. It's not entirely clear but arguable that the standard
> > requires EINVAL for P_PGID + pid==0, and plausible that there are
> > applications that depend on this. We could emulate the EINVAL case in
> > userspace, but assigning weird semantics to special cases is just a
> > mess of potential future headaches when it would be trivial to do it
> > right. And doing it right would also make programming the userspace side
> > easier.
>
> Since this is a POSIX interface shared between many unices, I took
> a look at a couple to see what they have done.  If possible, it is
> important to be able to write programs that work the same across
> different unices.
>
> The BSDs implemented wait6 as their common wait-that-does-everything
> system call.  Its large contribution is that wait6 will give both
> the rusage of the child that exited and that of its children separately.

Ah right, that was one more thing I forgot on my earlier list. If we end up
creating a new wait4()/waitid() replacement  to replace the timeval
with __kernel_timespec, we may want that to return a wrusage instead
of rusage. Same for Christian's proposed pidfd_wait().

> So my recommendation now is to avoid gratuitous incompatibilities.
>
> 1) For extending waitid, let's use P_PGID and an id of 0 to represent
>    the caller's process group.  That is compatible with the BSDs, and
>    portable programs should not have any problem with it.
>
>    I want to stay away from the BSD extension of P_PID with an id of 0
>    meaning wait for the calling process's process group.  I see where
>    it comes from, but that is confusing.
>
> 2) It appears the popular definition of current process group is the
>    current process group at the time of the system call.  Currently that
>    is Linux, FreeBSD, Illumos, and XNU.  So short of a real program that
>    cares, let's adopt that definition in linux.  Along with patches I
>    will see about getting the linux manpage updated to clarify that
>    point.

That all sounds reasonable to me, thanks for researching the other
implementations!

         Arnd
Christian Brauner July 23, 2019, 8:28 a.m. UTC | #20
On Tue, Jul 23, 2019 at 10:12:07AM +0200, Arnd Bergmann wrote:
> On Tue, Jul 23, 2019 at 2:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > Rich Felker <dalias@libc.org> writes:
> > > On Sun, Jul 21, 2019 at 04:40:27PM -0500, Eric W. Biederman wrote:
> > >> Linus Torvalds <torvalds@linux-foundation.org> writes:
> > > I'm in agreement that if an extension to the waitid syscall is added,
> > > it should be P_PROCESS_PGID, not defining some special case for
> > > pid==0. It's not entirely clear but arguable that the standard
> > > requires EINVAL for P_PGID + pid==0, and plausible that there are
> > > applications that depend on this. We could emulate the EINVAL case in
> > > userspace, but assigning weird semantics to special cases is just a
> > > mess of potential future headaches when it would be trivial to do it
> > > right. And doing it right would also make programming the userspace side
> > > easier.
> >
> > Since this is a POSIX interface shared between many unices, I took
> > a look at a couple to see what they have done.  If possible, it is
> > important to be able to write programs that work the same across
> > different unices.
> >
> > The BSDs implemented wait6 as their common wait-that-does-everything
> > system call.  Its large contribution is that wait6 will give both
> > the rusage of the child that exited and that of its children separately.
> 
> Ah right, that was one more thing I forgot on my earlier list. If we end up
> creating a new wait4()/waitid() replacement  to replace the timeval
> with __kernel_timespec, we may want that to return a wrusage instead
> of rusage. Same for Christian's proposed pidfd_wait().

So I'm in the middle of writing tests for pidfd_wait. And I'm currently
using struct rusage. I would suggest sending out a first version using
struct rusage, and then you can point out what you would like to see as a
replacement. What do you think, Arnd?

Christian
Arnd Bergmann July 23, 2019, 8:45 a.m. UTC | #21
On Tue, Jul 23, 2019 at 10:29 AM Christian Brauner <christian@brauner.io> wrote:
> On Tue, Jul 23, 2019 at 10:12:07AM +0200, Arnd Bergmann wrote:
> > On Tue, Jul 23, 2019 at 2:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > The BSDs implemented wait6 as their common wait-that-does-everything
> > > system call.  Its large contribution is that wait6 will give both
> > > the rusage of the child that exited and that of its children separately.
> >
> > Ah right, that was one more thing I forgot on my earlier list. If we end up
> > creating a new wait4()/waitid() replacement  to replace the timeval
> > with __kernel_timespec, we may want that to return a wrusage instead
> > of rusage. Same for Christian's proposed pidfd_wait().
>
> So I'm in the middle of writing tests for pidfd_wait. And I'm currently
> using struct rusage. I would suggest sending out a first version using
> struct rusage and then you can point out what you would like see as a
> replacement. What do you think, Arnd?

Sounds good to me; the debate over what rusage to use should not hold
up the review of the rest of that syscall.

       Arnd
Alistair Francis July 25, 2019, 12:04 a.m. UTC | #22
On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de> wrote:
>
> On Tue, Jul 23, 2019 at 10:29 AM Christian Brauner <christian@brauner.io> wrote:
> > On Tue, Jul 23, 2019 at 10:12:07AM +0200, Arnd Bergmann wrote:
> > > On Tue, Jul 23, 2019 at 2:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > > The BSDs implemented wait6 as their common wait that does everything
> > > > system call.   It's large contribution is that wait6 will give both
> > > > the rusage of the child that exited and it's children separately.
> > >
> > > Ah right, that was one more thing I forgot on my earlier list. If we end up
> > > creating a new wait4()/waitid() replacement  to replace the timeval
> > > with __kernel_timespec, we may want that to return a wrusage instead
> > > of rusage. Same for Christian's proposed pidfd_wait().
> >
> > So I'm in the middle of writing tests for pidfd_wait. And I'm currently
> > using struct rusage. I would suggest sending out a first version using
> > struct rusage and then you can point out what you would like see as a
> > replacement. What do you think, Arnd?
>
> Sounds good to me, the debate over what rusage to use should not hold
> up the review of the rest of that syscall.

I'm unclear what the final decision is here. What is the solution: are
we going to have wait4(), or add P_PROCESS_PGID to waitid()?

As well as that, what is the solution for current implementations? If we
add wait4() then there isn't an issue (and I can drop this patch), but
if we add P_PROCESS_PGID then we will need a way to handle kernels
with waitid() but no P_PROCESS_PGID (see the fallback sketch after this
message). My new plan, though, is to only use the waitid syscall if we
don't have waitpid or wait4, so it seems like this will only affect RV32
for the time being.

Alistair

>
>        Arnd
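
As a concrete illustration of the fallback Alistair describes, here is a
minimal sketch (the helper name is made up, and it assumes the extension
discussed below, where newer kernels accept waitid() with P_PGID and an
id of 0):

#include <errno.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t
wait_own_pgrp (siginfo_t *infop, int options)
{
  /* Newer kernels (with the proposed extension): id == 0 means the
     caller's process group at the time of the call.  */
  if (waitid (P_PGID, 0, infop, options | WEXITED) == 0)
    return infop->si_pid;

  if (errno != EINVAL)
    return -1;

  /* Older kernels reject id == 0 with EINVAL, so look the process
     group up ourselves.  This is racy against a concurrent setpgid ()
     in another thread, which is exactly the race the kernel-side
     extension avoids.  */
  if (waitid (P_PGID, getpgid (0), infop, options | WEXITED) == 0)
    return infop->si_pid;

  return -1;
}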
Rich Felker July 25, 2019, 4:40 a.m. UTC | #23
On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
> On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de> wrote:
> >
> > On Tue, Jul 23, 2019 at 10:29 AM Christian Brauner <christian@brauner.io> wrote:
> > > On Tue, Jul 23, 2019 at 10:12:07AM +0200, Arnd Bergmann wrote:
> > > > On Tue, Jul 23, 2019 at 2:01 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
> > > > > The BSDs implemented wait6 as their common wait that does everything
> > > > > system call.   It's large contribution is that wait6 will give both
> > > > > the rusage of the child that exited and it's children separately.
> > > >
> > > > Ah right, that was one more thing I forgot on my earlier list. If we end up
> > > > creating a new wait4()/waitid() replacement  to replace the timeval
> > > > with __kernel_timespec, we may want that to return a wrusage instead
> > > > of rusage. Same for Christian's proposed pidfd_wait().
> > >
> > > So I'm in the middle of writing tests for pidfd_wait. And I'm currently
> > > using struct rusage. I would suggest sending out a first version using
> > > struct rusage and then you can point out what you would like see as a
> > > replacement. What do you think, Arnd?
> >
> > Sounds good to me, the debate over what rusage to use should not hold
> > up the review of the rest of that syscall.
> 
> I'm unclear what the final decision is here. What is the solution are
> we going to have wait4() or add P_PROCESS_PGID to waitid()?
> 
> As well as that what is the solution to current implementations? If we
> add wait4() then there isn't an issue (and I can drop this patch) but
> if we add P_PROCESS_PGID then we will need a way to handle kernels
> with waitid() but no P_PROCESS_PGID. Although my new plan is to only
> use the waitid syscall if we don't have waitpid or wait4 so it seems
> like this will only affect RV32 for the time being.

I would really like some indication which solution will be taken,
since it impacts choices that will need to be made in musl very soon.
My favorite outcome would be bringing back wait4 for rv32 (and
no-time32 archs in general) *and* adding P_PROCESS_PGID. In the short
term, just using wait4 would be the simplest and cleanest for us (same
as all other archs, no extra case to deal with), but in the long term
there may be value in having rusage that can represent more than 68
cpu-years spent by a process (seems plausible with large numbers of
cores).

Rich
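
For reference, the 68 cpu-years figure follows from the 32-bit signed
seconds field of a time32 struct timeval inside struct rusage: 2^31 s is
2147483648 s, and 2147483648 / 31556952 (the number of seconds in an
average Gregorian year) is roughly 68.05 years of accumulated CPU time.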
Arnd Bergmann July 25, 2019, 1:15 p.m. UTC | #24
On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > >
> > > Sounds good to me, the debate over what rusage to use should not hold
> > > up the review of the rest of that syscall.
> >
> > I'm unclear what the final decision is here. What is the solution are
> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
> >
> > As well as that what is the solution to current implementations? If we
> > add wait4() then there isn't an issue (and I can drop this patch) but
> > if we add P_PROCESS_PGID then we will need a way to handle kernels
> > with waitid() but no P_PROCESS_PGID. Although my new plan is to only
> > use the waitid syscall if we don't have waitpid or wait4 so it seems
> > like this will only affect RV32 for the time being.
>
> I would really like some indication which solution will be taken,
> since it impacts choices that will need to be made in musl very soon.
> My favorite outcome would be bringing back wait4 for rv32 (and
> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the short
> term, just using wait4 would be the simplest and cleanest for us (same
> as all other archs, no extra case to deal with), but in the long term
> there may be value in having rusage that can represent more than 68
> cpu-years spent by a process (seems plausible with large numbers of
> cores).

Based on the feedback from Linus and Eric, the most likely outcome
at the moment seems to be an extension of waitid() to allow
P_PGID with id=0 like BSD does, and not bring back wait4() or
add P_PROCESS_PGID.

So far, I don't think anyone has proposed an actual kernel patch.
I was hoping that Eric would do it, but I could also send it if he's
otherwise busy.

      Arnd
Christian Brauner July 25, 2019, 4:06 p.m. UTC | #25
On Thu, Jul 25, 2019 at 03:15:35PM +0200, Arnd Bergmann wrote:
> On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
> > On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
> > > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de> wrote:
> > > >
> > > > Sounds good to me, the debate over what rusage to use should not hold
> > > > up the review of the rest of that syscall.
> > >
> > > I'm unclear what the final decision is here. What is the solution are
> > > we going to have wait4() or add P_PROCESS_PGID to waitid()?
> > >
> > > As well as that what is the solution to current implementations? If we
> > > add wait4() then there isn't an issue (and I can drop this patch) but
> > > if we add P_PROCESS_PGID then we will need a way to handle kernels
> > > with waitid() but no P_PROCESS_PGID. Although my new plan is to only
> > > use the waitid syscall if we don't have waitpid or wait4 so it seems
> > > like this will only affect RV32 for the time being.
> >
> > I would really like some indication which solution will be taken,
> > since it impacts choices that will need to be made in musl very soon.
> > My favorite outcome would be bringing back wait4 for rv32 (and
> > no-time32 archs in general) *and* adding P_PROCESS_PGID. In the short
> > term, just using wait4 would be the simplest and cleanest for us (same
> > as all other archs, no extra case to deal with), but in the long term
> > there may be value in having rusage that can represent more than 68
> > cpu-years spent by a process (seems plausible with large numbers of
> > cores).
> 
> Based on the feedback from Linus and Eric, the most likely outcome
> at the moment seems to be an extension of waitid() to allow
> P_PGID with id=0 like BSD does, and not bring back wait4() or
> add P_PROCESS_PGID.
> 
> So far, I don't think anyone has proposed an actual kernel patch.
> I was hoping that Eric would do it, but I could also send it if he's
> otherwise busy.

I'm touching waitid() right now. I can pick this up and put this in
there too.

Christian
Eric W. Biederman July 25, 2019, 5:14 p.m. UTC | #26
Arnd Bergmann <arnd@arndb.de> writes:

> On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
>> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
>> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de> wrote:
>> > >
>> > > Sounds good to me, the debate over what rusage to use should not hold
>> > > up the review of the rest of that syscall.
>> >
>> > I'm unclear what the final decision is here. What is the solution are
>> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
>> >
>> > As well as that what is the solution to current implementations? If we
>> > add wait4() then there isn't an issue (and I can drop this patch) but
>> > if we add P_PROCESS_PGID then we will need a way to handle kernels
>> > with waitid() but no P_PROCESS_PGID. Although my new plan is to only
>> > use the waitid syscall if we don't have waitpid or wait4 so it seems
>> > like this will only affect RV32 for the time being.
>>
>> I would really like some indication which solution will be taken,
>> since it impacts choices that will need to be made in musl very soon.
>> My favorite outcome would be bringing back wait4 for rv32 (and
>> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the short
>> term, just using wait4 would be the simplest and cleanest for us (same
>> as all other archs, no extra case to deal with), but in the long term
>> there may be value in having rusage that can represent more than 68
>> cpu-years spent by a process (seems plausible with large numbers of
>> cores).
>
> Based on the feedback from Linus and Eric, the most likely outcome
> at the moment seems to be an extension of waitid() to allow
> P_PGID with id=0 like BSD does, and not bring back wait4() or
> add P_PROCESS_PGID.
>
> So far, I don't think anyone has proposed an actual kernel patch.
> I was hoping that Eric would do it, but I could also send it if he's
> otherwise busy.

So here is what I am looking at.  It still needs to be tested,
and the description needs to be improved so that it properly credits
everyone.  However, I think I have the major strokes correct.

From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 23 Jul 2019 07:44:46 -0500
Subject: [PATCH] waitid: Add support for waiting for the current process group

It was recently discovered that the linux version of waitid is not a
superset of the other wait functions because it does not include
support for waiting for the current process group.  This has two
downsides.  An extra system call is needed to get the current process
group, and a signal that changes the current process group could come
in between the system call that retrieved the process group and the
call to waitid.

Allow userspace to avoid both of those issues by defining
idtype == P_PGID and id == 0 to mean wait for the caller's process
group at the time of the call.

Arguments can be made for using a different choice of idtype and id
for this case but the BSDs already use this P_PGID and 0 to indicate
waiting for the current process's process group.  So be nice to user
space programmers and don't introduce an unnecessary incompatibility.

Some people have noted that the POSIX description is that
waitpid will wait for the current process group, and that in
the presence of pthreads that process group can change.  To get
clarity on this issue I looked at XNU, FreeBSD, and Illumos.  All of
those flavors of unix waited for the current process group at the
time of the call and as written could not adapt to the process group
changing after the call.

At one point Linux did adapt to the current process group changing, but
that stopped in 161550d74c07 ("pid: sys_wait... fixes").  It has been
over 11 years since Linux had that behavior, no programs that fail
with the change in behavior have been reported, and I could not
find any other unix that does this.  So I think it is safe to clarify
the definition of current process group to mean the current process
group at the time of the wait call.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/exit.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index a75b6a7f458a..3d86930f035e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1577,14 +1577,16 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
 		break;
 	case P_PGID:
 		type = PIDTYPE_PGID;
-		if (upid <= 0)
+		if (upid < 0)
 			return -EINVAL;
+		if (upid == 0)
+			pid = get_pid(task_pgrp(current));
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	if (type < PIDTYPE_MAX)
+	if ((type < PIDTYPE_MAX) && !pid)
 		pid = find_get_pid(upid);
 
 	wo.wo_type	= type;
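
For illustration (not part of the patch), userspace usage under the
proposed semantics would look like the sketch below; the function name
is made up:

#include <stdio.h>
#include <sys/wait.h>

int
wait_any_in_own_pgrp (void)
{
  siginfo_t info = { 0 };

  /* With the patch above, P_PGID with id == 0 waits for any child in
     the caller's process group at the time of the call; kernels
     without the patch fail with EINVAL instead.  */
  if (waitid (P_PGID, 0, &info, WEXITED) != 0)
    return -1;

  printf ("reaped pid %d: si_code %d, si_status %d\n",
          (int) info.si_pid, info.si_code, info.si_status);
  return 0;
}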
Christian Brauner July 25, 2019, 5:30 p.m. UTC | #27
On July 25, 2019 7:14:05 PM GMT+02:00, ebiederm@xmission.com wrote:
>Arnd Bergmann <arnd@arndb.de> writes:
>
>> On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
>>> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
>>> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de>
>wrote:
>>> > >
>>> > > Sounds good to me, the debate over what rusage to use should not
>hold
>>> > > up the review of the rest of that syscall.
>>> >
>>> > I'm unclear what the final decision is here. What is the solution
>are
>>> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
>>> >
>>> > As well as that what is the solution to current implementations?
>If we
>>> > add wait4() then there isn't an issue (and I can drop this patch)
>but
>>> > if we add P_PROCESS_PGID then we will need a way to handle kernels
>>> > with waitid() but no P_PROCESS_PGID. Although my new plan is to
>only
>>> > use the waitid syscall if we don't have waitpid or wait4 so it
>seems
>>> > like this will only affect RV32 for the time being.
>>>
>>> I would really like some indication which solution will be taken,
>>> since it impacts choices that will need to be made in musl very
>soon.
>>> My favorite outcome would be bringing back wait4 for rv32 (and
>>> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the
>short
>>> term, just using wait4 would be the simplest and cleanest for us
>(same
>>> as all other archs, no extra case to deal with), but in the long
>term
>>> there may be value in having rusage that can represent more than 68
>>> cpu-years spent by a process (seems plausible with large numbers of
>>> cores).
>>
>> Based on the feedback from Linus and Eric, the most likely outcome
>> at the moment seems to be an extension of waitid() to allow
>> P_PGID with id=0 like BSD does, and not bring back wait4() or
>> add P_PROCESS_PGID.
>>
>> So far, I don't think anyone has proposed an actual kernel patch.
>> I was hoping that Eric would do it, but I could also send it if he's
>> otherwise busy.
>
>So here is what I am looking at.  It still needs to be tested
>and the description needs to be improved so that it properly credits
>everyone.  However I think I have the major stroeks correct.
>
>From: "Eric W. Biederman" <ebiederm@xmission.com>
>Date: Tue, 23 Jul 2019 07:44:46 -0500
>Subject: [PATCH] waitid: Add support for waiting for the current
>process group
>
>It was recently discovered that the linux version of waitid is not a
>superset of the other wait functions because it does not include
>support for waiting for the current process group.  This has two
>downsides.  An extra system call is needed to get the current process
>group, and a signal could come in between the system call that
>retrieved the process gorup and the call to waitid that changes the
>current process group.
>
>Allow userspace to avoid both of those issues by defining
>idtype == P_PGID and id == 0 to mean wait for the caller's process
>group at the time of the call.
>
>Arguments can be made for using a different choice of idtype and id
>for this case but the BSDs already use this P_PGID and 0 to indicate
>waiting for the current process's process group.  So be nice to user
>space programmers and don't introduce an unnecessary incompatibility.
>
>Some people have noted that the posix description is that
>waitpid will wait for the current process group, and that in
>the presence of pthreads that process group can change.  To get
>clarity on this issue I looked at XNU, FreeBSD, and Luminos.  All of
>those flavors of unix waited for the current process group at the
>time of call and as written could not adapt to the process group
>changing after the call.
>
>At one point Linux did adapt to the current process group changing but
>that stopped in 161550d74c07 ("pid: sys_wait... fixes").  It has been
>over 11 years since Linux has that behavior, no programs that fail
>with the change in behavior have been reported, and I could not
>find any other unix that does this.  So I think it is safe to clarify
>the definition of current process group, to current process group
>at the time of the wait function.
>
>Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>---
> kernel/exit.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
>diff --git a/kernel/exit.c b/kernel/exit.c
>index a75b6a7f458a..3d86930f035e 100644
>--- a/kernel/exit.c
>+++ b/kernel/exit.c
>@@ -1577,14 +1577,16 @@ static long kernel_waitid(int which, pid_t
>upid, struct waitid_info *infop,
> 		break;
> 	case P_PGID:
> 		type = PIDTYPE_PGID;
>-		if (upid <= 0)
>+		if (upid < 0)
> 			return -EINVAL;
>+		if (upid == 0)
>+			pid = get_pid(task_pgrp(current));
> 		break;
> 	default:
> 		return -EINVAL;
> 	}
> 
>-	if (type < PIDTYPE_MAX)
>+	if ((type < PIDTYPE_MAX) && !pid)
> 		pid = find_get_pid(upid);
> 
> 	wo.wo_type	= type;

Eric, mind if I send this out alongside the P_PIDFD patchset and put in a test for it?

Christian
Alistair Francis July 26, 2019, 11:35 p.m. UTC | #28
On Thu, Jul 25, 2019 at 10:14 AM Eric W. Biederman
<ebiederm@xmission.com> wrote:
>
> Arnd Bergmann <arnd@arndb.de> writes:
>
> > On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
> >> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
> >> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de> wrote:
> >> > >
> >> > > Sounds good to me, the debate over what rusage to use should not hold
> >> > > up the review of the rest of that syscall.
> >> >
> >> > I'm unclear what the final decision is here. What is the solution are
> >> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
> >> >
> >> > As well as that what is the solution to current implementations? If we
> >> > add wait4() then there isn't an issue (and I can drop this patch) but
> >> > if we add P_PROCESS_PGID then we will need a way to handle kernels
> >> > with waitid() but no P_PROCESS_PGID. Although my new plan is to only
> >> > use the waitid syscall if we don't have waitpid or wait4 so it seems
> >> > like this will only affect RV32 for the time being.
> >>
> >> I would really like some indication which solution will be taken,
> >> since it impacts choices that will need to be made in musl very soon.
> >> My favorite outcome would be bringing back wait4 for rv32 (and
> >> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the short
> >> term, just using wait4 would be the simplest and cleanest for us (same
> >> as all other archs, no extra case to deal with), but in the long term
> >> there may be value in having rusage that can represent more than 68
> >> cpu-years spent by a process (seems plausible with large numbers of
> >> cores).
> >
> > Based on the feedback from Linus and Eric, the most likely outcome
> > at the moment seems to be an extension of waitid() to allow
> > P_PGID with id=0 like BSD does, and not bring back wait4() or
> > add P_PROCESS_PGID.
> >
> > So far, I don't think anyone has proposed an actual kernel patch.
> > I was hoping that Eric would do it, but I could also send it if he's
> > otherwise busy.
>
> So here is what I am looking at.  It still needs to be tested
> and the description needs to be improved so that it properly credits
> everyone.  However I think I have the major stroeks correct.
>
> From: "Eric W. Biederman" <ebiederm@xmission.com>
> Date: Tue, 23 Jul 2019 07:44:46 -0500
> Subject: [PATCH] waitid: Add support for waiting for the current process group
>
> It was recently discovered that the linux version of waitid is not a
> superset of the other wait functions because it does not include
> support for waiting for the current process group.  This has two
> downsides.  An extra system call is needed to get the current process
> group, and a signal could come in between the system call that
> retrieved the process gorup and the call to waitid that changes the
> current process group.
>
> Allow userspace to avoid both of those issues by defining
> idtype == P_PGID and id == 0 to mean wait for the caller's process
> group at the time of the call.
>
> Arguments can be made for using a different choice of idtype and id
> for this case but the BSDs already use this P_PGID and 0 to indicate
> waiting for the current process's process group.  So be nice to user
> space programmers and don't introduce an unnecessary incompatibility.
>
> Some people have noted that the posix description is that
> waitpid will wait for the current process group, and that in
> the presence of pthreads that process group can change.  To get
> clarity on this issue I looked at XNU, FreeBSD, and Luminos.  All of
> those flavors of unix waited for the current process group at the
> time of call and as written could not adapt to the process group
> changing after the call.
>
> At one point Linux did adapt to the current process group changing but
> that stopped in 161550d74c07 ("pid: sys_wait... fixes").  It has been
> over 11 years since Linux has that behavior, no programs that fail
> with the change in behavior have been reported, and I could not
> find any other unix that does this.  So I think it is safe to clarify
> the definition of current process group, to current process group
> at the time of the wait function.
>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>

Will this land in 5.3? Also, will it be backported to stable kernels?

Do you mind keeping me in CC when you send the patch?

Alistair

> ---
>  kernel/exit.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/exit.c b/kernel/exit.c
> index a75b6a7f458a..3d86930f035e 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -1577,14 +1577,16 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
>                 break;
>         case P_PGID:
>                 type = PIDTYPE_PGID;
> -               if (upid <= 0)
> +               if (upid < 0)
>                         return -EINVAL;
> +               if (upid == 0)
> +                       pid = get_pid(task_pgrp(current));
>                 break;
>         default:
>                 return -EINVAL;
>         }
>
> -       if (type < PIDTYPE_MAX)
> +       if ((type < PIDTYPE_MAX) && !pid)
>                 pid = find_get_pid(upid);
>
>         wo.wo_type      = type;
> --
> 2.21.0.dirty
>
Alistair Francis Aug. 13, 2019, 10:22 p.m. UTC | #29
On Thu, Jul 25, 2019 at 10:30 AM Christian Brauner <christian@brauner.io> wrote:
>
> On July 25, 2019 7:14:05 PM GMT+02:00, ebiederm@xmission.com wrote:
> >Arnd Bergmann <arnd@arndb.de> writes:
> >
> >> On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
> >>> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
> >>> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de>
> >wrote:
> >>> > >
> >>> > > Sounds good to me, the debate over what rusage to use should not
> >hold
> >>> > > up the review of the rest of that syscall.
> >>> >
> >>> > I'm unclear what the final decision is here. What is the solution
> >are
> >>> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
> >>> >
> >>> > As well as that what is the solution to current implementations?
> >If we
> >>> > add wait4() then there isn't an issue (and I can drop this patch)
> >but
> >>> > if we add P_PROCESS_PGID then we will need a way to handle kernels
> >>> > with waitid() but no P_PROCESS_PGID. Although my new plan is to
> >only
> >>> > use the waitid syscall if we don't have waitpid or wait4 so it
> >seems
> >>> > like this will only affect RV32 for the time being.
> >>>
> >>> I would really like some indication which solution will be taken,
> >>> since it impacts choices that will need to be made in musl very
> >soon.
> >>> My favorite outcome would be bringing back wait4 for rv32 (and
> >>> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the
> >short
> >>> term, just using wait4 would be the simplest and cleanest for us
> >(same
> >>> as all other archs, no extra case to deal with), but in the long
> >term
> >>> there may be value in having rusage that can represent more than 68
> >>> cpu-years spent by a process (seems plausible with large numbers of
> >>> cores).
> >>
> >> Based on the feedback from Linus and Eric, the most likely outcome
> >> at the moment seems to be an extension of waitid() to allow
> >> P_PGID with id=0 like BSD does, and not bring back wait4() or
> >> add P_PROCESS_PGID.
> >>
> >> So far, I don't think anyone has proposed an actual kernel patch.
> >> I was hoping that Eric would do it, but I could also send it if he's
> >> otherwise busy.
> >
> >So here is what I am looking at.  It still needs to be tested
> >and the description needs to be improved so that it properly credits
> >everyone.  However I think I have the major stroeks correct.
> >
> >From: "Eric W. Biederman" <ebiederm@xmission.com>
> >Date: Tue, 23 Jul 2019 07:44:46 -0500
> >Subject: [PATCH] waitid: Add support for waiting for the current
> >process group
> >
> >It was recently discovered that the linux version of waitid is not a
> >superset of the other wait functions because it does not include
> >support for waiting for the current process group.  This has two
> >downsides.  An extra system call is needed to get the current process
> >group, and a signal could come in between the system call that
> >retrieved the process gorup and the call to waitid that changes the
> >current process group.
> >
> >Allow userspace to avoid both of those issues by defining
> >idtype == P_PGID and id == 0 to mean wait for the caller's process
> >group at the time of the call.
> >
> >Arguments can be made for using a different choice of idtype and id
> >for this case but the BSDs already use this P_PGID and 0 to indicate
> >waiting for the current process's process group.  So be nice to user
> >space programmers and don't introduce an unnecessary incompatibility.
> >
> >Some people have noted that the posix description is that
> >waitpid will wait for the current process group, and that in
> >the presence of pthreads that process group can change.  To get
> >clarity on this issue I looked at XNU, FreeBSD, and Luminos.  All of
> >those flavors of unix waited for the current process group at the
> >time of call and as written could not adapt to the process group
> >changing after the call.
> >
> >At one point Linux did adapt to the current process group changing but
> >that stopped in 161550d74c07 ("pid: sys_wait... fixes").  It has been
> >over 11 years since Linux has that behavior, no programs that fail
> >with the change in behavior have been reported, and I could not
> >find any other unix that does this.  So I think it is safe to clarify
> >the definition of current process group, to current process group
> >at the time of the wait function.
> >
> >Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> >---
> > kernel/exit.c | 6 ++++--
> > 1 file changed, 4 insertions(+), 2 deletions(-)
> >
> >diff --git a/kernel/exit.c b/kernel/exit.c
> >index a75b6a7f458a..3d86930f035e 100644
> >--- a/kernel/exit.c
> >+++ b/kernel/exit.c
> >@@ -1577,14 +1577,16 @@ static long kernel_waitid(int which, pid_t
> >upid, struct waitid_info *infop,
> >               break;
> >       case P_PGID:
> >               type = PIDTYPE_PGID;
> >-              if (upid <= 0)
> >+              if (upid < 0)
> >                       return -EINVAL;
> >+              if (upid == 0)
> >+                      pid = get_pid(task_pgrp(current));
> >               break;
> >       default:
> >               return -EINVAL;
> >       }
> >
> >-      if (type < PIDTYPE_MAX)
> >+      if ((type < PIDTYPE_MAX) && !pid)
> >               pid = find_get_pid(upid);
> >
> >       wo.wo_type      = type;
>
> Eric, mind if I send this out alongside the P_PIDFD patchset and put in a test for it?

Was this ever sent?

Alistair

>
> Christian
Rich Felker Aug. 13, 2019, 11:11 p.m. UTC | #30
On Tue, Aug 13, 2019 at 03:22:02PM -0700, Alistair Francis wrote:
> On Thu, Jul 25, 2019 at 10:30 AM Christian Brauner <christian@brauner.io> wrote:
> >
> > On July 25, 2019 7:14:05 PM GMT+02:00, ebiederm@xmission.com wrote:
> > >Arnd Bergmann <arnd@arndb.de> writes:
> > >
> > >> On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org> wrote:
> > >>> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis wrote:
> > >>> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de>
> > >wrote:
> > >>> > >
> > >>> > > Sounds good to me, the debate over what rusage to use should not
> > >hold
> > >>> > > up the review of the rest of that syscall.
> > >>> >
> > >>> > I'm unclear what the final decision is here. What is the solution
> > >are
> > >>> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
> > >>> >
> > >>> > As well as that what is the solution to current implementations?
> > >If we
> > >>> > add wait4() then there isn't an issue (and I can drop this patch)
> > >but
> > >>> > if we add P_PROCESS_PGID then we will need a way to handle kernels
> > >>> > with waitid() but no P_PROCESS_PGID. Although my new plan is to
> > >only
> > >>> > use the waitid syscall if we don't have waitpid or wait4 so it
> > >seems
> > >>> > like this will only affect RV32 for the time being.
> > >>>
> > >>> I would really like some indication which solution will be taken,
> > >>> since it impacts choices that will need to be made in musl very
> > >soon.
> > >>> My favorite outcome would be bringing back wait4 for rv32 (and
> > >>> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the
> > >short
> > >>> term, just using wait4 would be the simplest and cleanest for us
> > >(same
> > >>> as all other archs, no extra case to deal with), but in the long
> > >term
> > >>> there may be value in having rusage that can represent more than 68
> > >>> cpu-years spent by a process (seems plausible with large numbers of
> > >>> cores).
> > >>
> > >> Based on the feedback from Linus and Eric, the most likely outcome
> > >> at the moment seems to be an extension of waitid() to allow
> > >> P_PGID with id=0 like BSD does, and not bring back wait4() or
> > >> add P_PROCESS_PGID.
> > >>
> > >> So far, I don't think anyone has proposed an actual kernel patch.
> > >> I was hoping that Eric would do it, but I could also send it if he's
> > >> otherwise busy.
> > >
> > >So here is what I am looking at.  It still needs to be tested
> > >and the description needs to be improved so that it properly credits
> > >everyone.  However I think I have the major stroeks correct.
> > >
> > >From: "Eric W. Biederman" <ebiederm@xmission.com>
> > >Date: Tue, 23 Jul 2019 07:44:46 -0500
> > >Subject: [PATCH] waitid: Add support for waiting for the current
> > >process group
> > >
> > >It was recently discovered that the linux version of waitid is not a
> > >superset of the other wait functions because it does not include
> > >support for waiting for the current process group.  This has two
> > >downsides.  An extra system call is needed to get the current process
> > >group, and a signal could come in between the system call that
> > >retrieved the process gorup and the call to waitid that changes the
> > >current process group.
> > >
> > >Allow userspace to avoid both of those issues by defining
> > >idtype == P_PGID and id == 0 to mean wait for the caller's process
> > >group at the time of the call.
> > >
> > >Arguments can be made for using a different choice of idtype and id
> > >for this case but the BSDs already use this P_PGID and 0 to indicate
> > >waiting for the current process's process group.  So be nice to user
> > >space programmers and don't introduce an unnecessary incompatibility.
> > >
> > >Some people have noted that the posix description is that
> > >waitpid will wait for the current process group, and that in
> > >the presence of pthreads that process group can change.  To get
> > >clarity on this issue I looked at XNU, FreeBSD, and Luminos.  All of
> > >those flavors of unix waited for the current process group at the
> > >time of call and as written could not adapt to the process group
> > >changing after the call.
> > >
> > >At one point Linux did adapt to the current process group changing but
> > >that stopped in 161550d74c07 ("pid: sys_wait... fixes").  It has been
> > >over 11 years since Linux has that behavior, no programs that fail
> > >with the change in behavior have been reported, and I could not
> > >find any other unix that does this.  So I think it is safe to clarify
> > >the definition of current process group, to current process group
> > >at the time of the wait function.
> > >
> > >Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> > >---
> > > kernel/exit.c | 6 ++++--
> > > 1 file changed, 4 insertions(+), 2 deletions(-)
> > >
> > >diff --git a/kernel/exit.c b/kernel/exit.c
> > >index a75b6a7f458a..3d86930f035e 100644
> > >--- a/kernel/exit.c
> > >+++ b/kernel/exit.c
> > >@@ -1577,14 +1577,16 @@ static long kernel_waitid(int which, pid_t
> > >upid, struct waitid_info *infop,
> > >               break;
> > >       case P_PGID:
> > >               type = PIDTYPE_PGID;
> > >-              if (upid <= 0)
> > >+              if (upid < 0)
> > >                       return -EINVAL;
> > >+              if (upid == 0)
> > >+                      pid = get_pid(task_pgrp(current));
> > >               break;
> > >       default:
> > >               return -EINVAL;
> > >       }
> > >
> > >-      if (type < PIDTYPE_MAX)
> > >+      if ((type < PIDTYPE_MAX) && !pid)
> > >               pid = find_get_pid(upid);
> > >
> > >       wo.wo_type      = type;
> >
> > Eric, mind if I send this out alongside the P_PIDFD patchset and put in a test for it?
> 
> Was this ever sent?

It's not upstream. Would be really nice to get this fixed in 5.3 so
that RV32 will not be missing functionality...

Rich
Christian Brauner Aug. 14, 2019, 5:07 a.m. UTC | #31
On August 14, 2019 1:11:53 AM GMT+02:00, Rich Felker <dalias@libc.org> wrote:
>On Tue, Aug 13, 2019 at 03:22:02PM -0700, Alistair Francis wrote:
>> On Thu, Jul 25, 2019 at 10:30 AM Christian Brauner
><christian@brauner.io> wrote:
>> >
>> > On July 25, 2019 7:14:05 PM GMT+02:00, ebiederm@xmission.com wrote:
>> > >Arnd Bergmann <arnd@arndb.de> writes:
>> > >
>> > >> On Thu, Jul 25, 2019 at 6:40 AM Rich Felker <dalias@libc.org>
>wrote:
>> > >>> On Wed, Jul 24, 2019 at 05:04:53PM -0700, Alistair Francis
>wrote:
>> > >>> > On Tue, Jul 23, 2019 at 1:45 AM Arnd Bergmann <arnd@arndb.de>
>> > >wrote:
>> > >>> > >
>> > >>> > > Sounds good to me, the debate over what rusage to use
>should not
>> > >hold
>> > >>> > > up the review of the rest of that syscall.
>> > >>> >
>> > >>> > I'm unclear what the final decision is here. What is the
>solution
>> > >are
>> > >>> > we going to have wait4() or add P_PROCESS_PGID to waitid()?
>> > >>> >
>> > >>> > As well as that what is the solution to current
>implementations?
>> > >If we
>> > >>> > add wait4() then there isn't an issue (and I can drop this
>patch)
>> > >but
>> > >>> > if we add P_PROCESS_PGID then we will need a way to handle
>kernels
>> > >>> > with waitid() but no P_PROCESS_PGID. Although my new plan is
>to
>> > >only
>> > >>> > use the waitid syscall if we don't have waitpid or wait4 so
>it
>> > >seems
>> > >>> > like this will only affect RV32 for the time being.
>> > >>>
>> > >>> I would really like some indication which solution will be
>taken,
>> > >>> since it impacts choices that will need to be made in musl very
>> > >soon.
>> > >>> My favorite outcome would be bringing back wait4 for rv32 (and
>> > >>> no-time32 archs in general) *and* adding P_PROCESS_PGID. In the
>> > >short
>> > >>> term, just using wait4 would be the simplest and cleanest for
>us
>> > >(same
>> > >>> as all other archs, no extra case to deal with), but in the
>long
>> > >term
>> > >>> there may be value in having rusage that can represent more
>than 68
>> > >>> cpu-years spent by a process (seems plausible with large
>numbers of
>> > >>> cores).
>> > >>
>> > >> Based on the feedback from Linus and Eric, the most likely
>outcome
>> > >> at the moment seems to be an extension of waitid() to allow
>> > >> P_PGID with id=0 like BSD does, and not bring back wait4() or
>> > >> add P_PROCESS_PGID.
>> > >>
>> > >> So far, I don't think anyone has proposed an actual kernel
>patch.
>> > >> I was hoping that Eric would do it, but I could also send it if
>he's
>> > >> otherwise busy.
>> > >
>> > >So here is what I am looking at.  It still needs to be tested
>> > >and the description needs to be improved so that it properly
>credits
>> > >everyone.  However I think I have the major stroeks correct.
>> > >
>> > >From: "Eric W. Biederman" <ebiederm@xmission.com>
>> > >Date: Tue, 23 Jul 2019 07:44:46 -0500
>> > >Subject: [PATCH] waitid: Add support for waiting for the current
>> > >process group
>> > >
>> > >It was recently discovered that the linux version of waitid is not
>a
>> > >superset of the other wait functions because it does not include
>> > >support for waiting for the current process group.  This has two
>> > >downsides.  An extra system call is needed to get the current
>process
>> > >group, and a signal could come in between the system call that
>> > >retrieved the process gorup and the call to waitid that changes
>the
>> > >current process group.
>> > >
>> > >Allow userspace to avoid both of those issues by defining
>> > >idtype == P_PGID and id == 0 to mean wait for the caller's process
>> > >group at the time of the call.
>> > >
>> > >Arguments can be made for using a different choice of idtype and
>id
>> > >for this case but the BSDs already use this P_PGID and 0 to
>indicate
>> > >waiting for the current process's process group.  So be nice to
>user
>> > >space programmers and don't introduce an unnecessary
>incompatibility.
>> > >
>> > >Some people have noted that the posix description is that
>> > >waitpid will wait for the current process group, and that in
>> > >the presence of pthreads that process group can change.  To get
>> > >clarity on this issue I looked at XNU, FreeBSD, and Luminos.  All
>of
>> > >those flavors of unix waited for the current process group at the
>> > >time of call and as written could not adapt to the process group
>> > >changing after the call.
>> > >
>> > >At one point Linux did adapt to the current process group changing
>but
>> > >that stopped in 161550d74c07 ("pid: sys_wait... fixes").  It has
>been
>> > >over 11 years since Linux has that behavior, no programs that fail
>> > >with the change in behavior have been reported, and I could not
>> > >find any other unix that does this.  So I think it is safe to
>clarify
>> > >the definition of current process group, to current process group
>> > >at the time of the wait function.
>> > >
>> > >Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
>> > >---
>> > > kernel/exit.c | 6 ++++--
>> > > 1 file changed, 4 insertions(+), 2 deletions(-)
>> > >
>> > >diff --git a/kernel/exit.c b/kernel/exit.c
>> > >index a75b6a7f458a..3d86930f035e 100644
>> > >--- a/kernel/exit.c
>> > >+++ b/kernel/exit.c
>> > >@@ -1577,14 +1577,16 @@ static long kernel_waitid(int which, pid_t
>> > >upid, struct waitid_info *infop,
>> > >               break;
>> > >       case P_PGID:
>> > >               type = PIDTYPE_PGID;
>> > >-              if (upid <= 0)
>> > >+              if (upid < 0)
>> > >                       return -EINVAL;
>> > >+              if (upid == 0)
>> > >+                      pid = get_pid(task_pgrp(current));
>> > >               break;
>> > >       default:
>> > >               return -EINVAL;
>> > >       }
>> > >
>> > >-      if (type < PIDTYPE_MAX)
>> > >+      if ((type < PIDTYPE_MAX) && !pid)
>> > >               pid = find_get_pid(upid);
>> > >
>> > >       wo.wo_type      = type;
>> >
>> > Eric, mind if I send this out alongside the P_PIDFD patchset and
>put in a test for it?
>> 
>> Was this ever sent?
>
>It's not upstream. Would be really nice to get this fixed in 5.3 so
>that RV32 will not be missing functionality...
>
>Rich

I'll pick it up as-is and send it out for review later if I hear no objections.

Christian

Patch

diff --git a/ChangeLog b/ChangeLog
index 9ca390a9c3..fb54a4aa61 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1029,6 +1029,9 @@ 
 	* sysdeps/unix/sysv/linux/nanosleep.c: Likewise.
 	* sysdeps/unix/sysv/linux/nanosleep_nocancel.c: Likewise.
 	* sysdeps/unix/sysv/linux/gettimeofday.c: Use clock_gettime64 syscall for gettimeofday.
+	* sysdeps/unix/sysv/linux/wait.c: Use __NR_waitid if available.
+	* sysdeps/unix/sysv/linux/waitpid.c: Likewise.
+	* sysdeps/unix/sysv/linux/waitpid_nocancel.c: Likewise.
 
 2019-06-20  Dmitry V. Levin  <ldv@altlinux.org>
 	    Florian Weimer  <fweimer@redhat.com>
diff --git a/sysdeps/unix/sysv/linux/wait.c b/sysdeps/unix/sysv/linux/wait.c
index 498bd1c095..e727afa14b 100644
--- a/sysdeps/unix/sysv/linux/wait.c
+++ b/sysdeps/unix/sysv/linux/wait.c
@@ -26,9 +26,42 @@ 
 pid_t
 __libc_wait (int *stat_loc)
 {
-  pid_t result = SYSCALL_CANCEL (wait4, WAIT_ANY, stat_loc, 0,
-				 (struct rusage *) NULL);
-  return result;
+#ifdef __NR_waitid
+  siginfo_t infop;
+  __pid_t ret;
+
+  ret = SYSCALL_CANCEL (waitid, P_ALL, 0, &infop, WEXITED, NULL);
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (stat_loc) {
+    *stat_loc = 0;
+    switch (infop.si_code) {
+    case CLD_EXITED:
+      *stat_loc = infop.si_status << 8;
+      break;
+    case CLD_DUMPED:
+      *stat_loc = 0x80;
+    case CLD_KILLED:
+      *stat_loc |= infop.si_status;
+      break;
+    case CLD_TRAPPED:
+    case CLD_STOPPED:
+      *stat_loc = infop.si_status << 8 | 0x7f;
+      break;
+    case CLD_CONTINUED:
+      *stat_loc = 0xffff;
+      break;
+    }
+  }
+
+  return infop.si_pid;
+#else
+  return SYSCALL_CANCEL (wait4, WAIT_ANY, stat_loc, 0,
+                         (struct rusage *) NULL);
+#endif
 }
 
 weak_alias (__libc_wait, __wait)
diff --git a/sysdeps/unix/sysv/linux/waitpid.c b/sysdeps/unix/sysv/linux/waitpid.c
index f0897574c0..7d4e0bb77d 100644
--- a/sysdeps/unix/sysv/linux/waitpid.c
+++ b/sysdeps/unix/sysv/linux/waitpid.c
@@ -20,12 +20,58 @@ 
 #include <sysdep-cancel.h>
 #include <stdlib.h>
 #include <sys/wait.h>
+#include <unistd.h>
 
 __pid_t
 __waitpid (__pid_t pid, int *stat_loc, int options)
 {
 #ifdef __NR_waitpid
   return SYSCALL_CANCEL (waitpid, pid, stat_loc, options);
+#elif defined(__NR_waitid)
+  __pid_t ret;
+  idtype_t idtype = P_PID;
+  siginfo_t infop;
+
+  if (pid < -1) {
+    idtype = P_PGID;
+    pid *= -1;
+  } else if (pid == -1) {
+    idtype = P_ALL;
+  } else if (pid == 0) {
+    idtype = P_PGID;
+    pid = getpgrp();
+  }
+
+  options |= WEXITED;
+
+  ret = SYSCALL_CANCEL (waitid, idtype, pid, &infop, options, NULL);
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (stat_loc) {
+    *stat_loc = 0;
+    switch (infop.si_code) {
+    case CLD_EXITED:
+      *stat_loc = infop.si_status << 8;
+      break;
+    case CLD_DUMPED:
+      *stat_loc = 0x80;
+    case CLD_KILLED:
+      *stat_loc |= infop.si_status;
+      break;
+    case CLD_TRAPPED:
+    case CLD_STOPPED:
+      *stat_loc = infop.si_status << 8 | 0x7f;
+      break;
+    case CLD_CONTINUED:
+      *stat_loc = 0xffff;
+      break;
+    }
+  }
+
+  return infop.si_pid;
 #else
   return SYSCALL_CANCEL (wait4, pid, stat_loc, options, NULL);
 #endif
diff --git a/sysdeps/unix/sysv/linux/waitpid_nocancel.c b/sysdeps/unix/sysv/linux/waitpid_nocancel.c
index 89e36a5c0b..c92f05421b 100644
--- a/sysdeps/unix/sysv/linux/waitpid_nocancel.c
+++ b/sysdeps/unix/sysv/linux/waitpid_nocancel.c
@@ -27,6 +27,51 @@  __waitpid_nocancel (__pid_t pid, int *stat_loc, int options)
 {
 #ifdef __NR_waitpid
   return INLINE_SYSCALL_CALL (waitpid, pid, stat_loc, options);
+#elif defined(__NR_waitid)
+  __pid_t ret;
+  idtype_t idtype = P_PID;
+  siginfo_t infop;
+
+  if (pid < -1) {
+    idtype = P_PGID;
+    pid *= -1;
+  } else if (pid == -1) {
+    idtype = P_ALL;
+  } else if (pid == 0) {
+    idtype = P_PGID;
+    pid = getpgrp();
+  }
+
+  options |= WEXITED;
+
+  ret = INLINE_SYSCALL_CALL (waitid, idtype, pid, &infop, options, NULL);
+
+  if (ret < 0) {
+    return ret;
+  }
+
+  if (stat_loc) {
+    *stat_loc = 0;
+    switch (infop.si_code) {
+    case CLD_EXITED:
+      *stat_loc = infop.si_status << 8;
+      break;
+    case CLD_DUMPED:
+      *stat_loc = 0x80;
+    case CLD_KILLED:
+      *stat_loc |= infop.si_status;
+      break;
+    case CLD_TRAPPED:
+    case CLD_STOPPED:
+      *stat_loc = infop.si_status << 8 | 0x7f;
+      break;
+    case CLD_CONTINUED:
+      *stat_loc = 0xffff;
+      break;
+    }
+  }
+
+  return infop.si_pid;
 #else
   return INLINE_SYSCALL_CALL (wait4, pid, stat_loc, options, NULL);
 #endif
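
For reference, here is the conversion logic from the hunks above, pulled
out into standalone form as a reading aid (the helper names are made up);
the encoding can be sanity-checked against the WIFEXITED, WIFSIGNALED,
WIFSTOPPED, and WIFCONTINUED macros in <sys/wait.h>:

#include <sys/wait.h>
#include <unistd.h>

/* Encode a siginfo_t filled in by waitid into a traditional wait
   status word, mirroring the switch in the patch.  Note the deliberate
   fall-through from CLD_DUMPED into CLD_KILLED: the core-dump flag
   (0x80) is set first and the terminating signal is then OR-ed in.  */
static int
siginfo_to_status (const siginfo_t *infop)
{
  int status = 0;

  switch (infop->si_code)
    {
    case CLD_EXITED:
      status = infop->si_status << 8;         /* WIFEXITED / WEXITSTATUS.  */
      break;
    case CLD_DUMPED:
      status = 0x80;                          /* WCOREDUMP flag.  */
      /* Fall through.  */
    case CLD_KILLED:
      status |= infop->si_status;             /* WIFSIGNALED / WTERMSIG.  */
      break;
    case CLD_TRAPPED:
    case CLD_STOPPED:
      status = infop->si_status << 8 | 0x7f;  /* WIFSTOPPED / WSTOPSIG.  */
      break;
    case CLD_CONTINUED:
      status = 0xffff;                        /* WIFCONTINUED.  */
      break;
    }

  return status;
}

/* Map a waitpid-style pid argument onto waitid arguments, as the
   waitpid.c hunk does.  The pid == 0 branch is where the extra
   getpgrp () call, and the race discussed in the thread, comes from.  */
static void
pid_to_waitid_args (pid_t pid, idtype_t *idtype, id_t *id)
{
  if (pid < -1)
    {
      *idtype = P_PGID;       /* Any child in process group -pid.  */
      *id = -pid;
    }
  else if (pid == -1)
    {
      *idtype = P_ALL;        /* Any child at all.  */
      *id = 0;
    }
  else if (pid == 0)
    {
      *idtype = P_PGID;       /* Any child in the caller's group.  */
      *id = getpgrp ();
    }
  else
    {
      *idtype = P_PID;        /* Exactly one specific child.  */
      *id = pid;
    }
}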