diff mbox series

__check_pf: Add a cancellation cleanup handler [BZ #20975]

Message ID 20230427200615.1496059-1-hjl.tools@gmail.com
State New
Headers show
Series __check_pf: Add a cancellation cleanup handler [BZ #20975] | expand

Commit Message

H.J. Lu April 27, 2023, 8:06 p.m. UTC
There are reports for hang in __check_pf:

https://github.com/JoeDog/siege/issues/4

It is reproducible only under specific configurations:

1. Large number of cores (>= 64) and large number of threads (> 3X of
the number of cores) with long lived socket connection.
2. Low power (frequency) mode.
3. Power management is enabled.

While holding lock, __check_pf calls make_request which calls __sendto
and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
lock held by __check_pf won't be released and can cause deadlock when
thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
cleanup handler for __check_pf to unlock the lock when cancelled by
another thread.  This fixes BZ #20975 and the siege hang issue.
---
 sysdeps/unix/sysv/linux/Makefile   |  2 ++
 sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

Comments

Noah Goldstein April 27, 2023, 8:42 p.m. UTC | #1
On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> There are reports for hang in __check_pf:
>
> https://github.com/JoeDog/siege/issues/4
>
> It is reproducible only under specific configurations:
>
> 1. Large number of cores (>= 64) and large number of threads (> 3X of
> the number of cores) with long lived socket connection.
> 2. Low power (frequency) mode.
> 3. Power management is enabled.
>
> While holding lock, __check_pf calls make_request which calls __sendto
> and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> lock held by __check_pf won't be released and can cause deadlock when
> thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> cleanup handler for __check_pf to unlock the lock when cancelled by
> another thread.  This fixes BZ #20975 and the siege hang issue.
> ---
>  sysdeps/unix/sysv/linux/Makefile   |  2 ++
>  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
>  2 files changed, 17 insertions(+)
>
> diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> index aec7a94785..0160be8790 100644
> --- a/sysdeps/unix/sysv/linux/Makefile
> +++ b/sysdeps/unix/sysv/linux/Makefile
> @@ -529,6 +529,8 @@ sysdep_headers += \
>  sysdep_routines += \
>    netlink_assert_response \
>    # sysdep_routines
> +
> +CFLAGS-check_pf.c += -fexceptions
>  endif
>
>  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> index b157c5126c..2b0b8b6368 100644
> --- a/sysdeps/unix/sysv/linux/check_pf.c
> +++ b/sysdeps/unix/sysv/linux/check_pf.c
> @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
>    return NULL;
>  }
>
> +#ifdef __EXCEPTIONS
> +static void
> +cancel_handler (void *arg __attribute__((unused)))
> +{
> +  /* Release the lock.  */
> +  __libc_lock_unlock (lock);
> +}
> +#endif
>
>  void
>  attribute_hidden
> @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
>    struct cached_data *olddata = NULL;
>    struct cached_data *data = NULL;
>
> +#ifdef __EXCEPTIONS
> +  /* Make sure that lock is released when the thread is cancelled.  */
> +  __libc_cleanup_push (cancel_handler, NULL);
> +#endif
>    __libc_lock_lock (lock);
>
>    if (cache_valid_p ())
> @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
>         }
>      }
>
> +#ifdef __EXCEPTIONS
> +  __libc_cleanup_pop (0);
> +#endif
>    __libc_lock_unlock (lock);
>
>    if (data != NULL)
> --
> 2.40.0
>

Can we add a test the will just XFAIL (or XPASS) if cores < 64?
H.J. Lu April 27, 2023, 8:59 p.m. UTC | #2
On Thu, Apr 27, 2023 at 1:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > There are reports for hang in __check_pf:
> >
> > https://github.com/JoeDog/siege/issues/4
> >
> > It is reproducible only under specific configurations:
> >
> > 1. Large number of cores (>= 64) and large number of threads (> 3X of
> > the number of cores) with long lived socket connection.
> > 2. Low power (frequency) mode.
> > 3. Power management is enabled.
> >
> > While holding lock, __check_pf calls make_request which calls __sendto
> > and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> > lock held by __check_pf won't be released and can cause deadlock when
> > thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> > cleanup handler for __check_pf to unlock the lock when cancelled by
> > another thread.  This fixes BZ #20975 and the siege hang issue.
> > ---
> >  sysdeps/unix/sysv/linux/Makefile   |  2 ++
> >  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
> >  2 files changed, 17 insertions(+)
> >
> > diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> > index aec7a94785..0160be8790 100644
> > --- a/sysdeps/unix/sysv/linux/Makefile
> > +++ b/sysdeps/unix/sysv/linux/Makefile
> > @@ -529,6 +529,8 @@ sysdep_headers += \
> >  sysdep_routines += \
> >    netlink_assert_response \
> >    # sysdep_routines
> > +
> > +CFLAGS-check_pf.c += -fexceptions
> >  endif
> >
> >  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> > diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> > index b157c5126c..2b0b8b6368 100644
> > --- a/sysdeps/unix/sysv/linux/check_pf.c
> > +++ b/sysdeps/unix/sysv/linux/check_pf.c
> > @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
> >    return NULL;
> >  }
> >
> > +#ifdef __EXCEPTIONS
> > +static void
> > +cancel_handler (void *arg __attribute__((unused)))
> > +{
> > +  /* Release the lock.  */
> > +  __libc_lock_unlock (lock);
> > +}
> > +#endif
> >
> >  void
> >  attribute_hidden
> > @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> >    struct cached_data *olddata = NULL;
> >    struct cached_data *data = NULL;
> >
> > +#ifdef __EXCEPTIONS
> > +  /* Make sure that lock is released when the thread is cancelled.  */
> > +  __libc_cleanup_push (cancel_handler, NULL);
> > +#endif
> >    __libc_lock_lock (lock);
> >
> >    if (cache_valid_p ())
> > @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> >         }
> >      }
> >
> > +#ifdef __EXCEPTIONS
> > +  __libc_cleanup_pop (0);
> > +#endif
> >    __libc_lock_unlock (lock);
> >
> >    if (data != NULL)
> > --
> > 2.40.0
> >
>
> Can we add a test the will just XFAIL (or XPASS) if cores < 64?

There is no simple test for this.  This is one reason why it hasn't been fixed
since 2016.
Noah Goldstein April 27, 2023, 9:44 p.m. UTC | #3
On Thu, Apr 27, 2023 at 4:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 27, 2023 at 1:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > There are reports for hang in __check_pf:
> > >
> > > https://github.com/JoeDog/siege/issues/4
> > >
> > > It is reproducible only under specific configurations:
> > >
> > > 1. Large number of cores (>= 64) and large number of threads (> 3X of
> > > the number of cores) with long lived socket connection.
> > > 2. Low power (frequency) mode.
> > > 3. Power management is enabled.
> > >
> > > While holding lock, __check_pf calls make_request which calls __sendto
> > > and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> > > lock held by __check_pf won't be released and can cause deadlock when
> > > thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> > > cleanup handler for __check_pf to unlock the lock when cancelled by
> > > another thread.  This fixes BZ #20975 and the siege hang issue.
> > > ---
> > >  sysdeps/unix/sysv/linux/Makefile   |  2 ++
> > >  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
> > >  2 files changed, 17 insertions(+)
> > >
> > > diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> > > index aec7a94785..0160be8790 100644
> > > --- a/sysdeps/unix/sysv/linux/Makefile
> > > +++ b/sysdeps/unix/sysv/linux/Makefile
> > > @@ -529,6 +529,8 @@ sysdep_headers += \
> > >  sysdep_routines += \
> > >    netlink_assert_response \
> > >    # sysdep_routines
> > > +
> > > +CFLAGS-check_pf.c += -fexceptions
> > >  endif
> > >
> > >  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> > > diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> > > index b157c5126c..2b0b8b6368 100644
> > > --- a/sysdeps/unix/sysv/linux/check_pf.c
> > > +++ b/sysdeps/unix/sysv/linux/check_pf.c
> > > @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
> > >    return NULL;
> > >  }
> > >
> > > +#ifdef __EXCEPTIONS
> > > +static void
> > > +cancel_handler (void *arg __attribute__((unused)))
> > > +{
> > > +  /* Release the lock.  */
> > > +  __libc_lock_unlock (lock);
> > > +}
> > > +#endif
> > >
> > >  void
> > >  attribute_hidden
> > > @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> > >    struct cached_data *olddata = NULL;
> > >    struct cached_data *data = NULL;
> > >
> > > +#ifdef __EXCEPTIONS
> > > +  /* Make sure that lock is released when the thread is cancelled.  */
> > > +  __libc_cleanup_push (cancel_handler, NULL);
> > > +#endif
> > >    __libc_lock_lock (lock);
> > >
> > >    if (cache_valid_p ())
> > > @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> > >         }
> > >      }
> > >
> > > +#ifdef __EXCEPTIONS
> > > +  __libc_cleanup_pop (0);
> > > +#endif
> > >    __libc_lock_unlock (lock);
> > >
> > >    if (data != NULL)
> > > --
> > > 2.40.0
> > >
> >
> > Can we add a test the will just XFAIL (or XPASS) if cores < 64?
>
> There is no simple test for this.  This is one reason why it hasn't been fixed
> since 2016.
>

Did you verify this fixes issue in siege?
> --
> H.J.
H.J. Lu April 27, 2023, 9:50 p.m. UTC | #4
On Thu, Apr 27, 2023 at 2:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 27, 2023 at 4:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Thu, Apr 27, 2023 at 1:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > There are reports for hang in __check_pf:
> > > >
> > > > https://github.com/JoeDog/siege/issues/4
> > > >
> > > > It is reproducible only under specific configurations:
> > > >
> > > > 1. Large number of cores (>= 64) and large number of threads (> 3X of
> > > > the number of cores) with long lived socket connection.
> > > > 2. Low power (frequency) mode.
> > > > 3. Power management is enabled.
> > > >
> > > > While holding lock, __check_pf calls make_request which calls __sendto
> > > > and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> > > > lock held by __check_pf won't be released and can cause deadlock when
> > > > thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> > > > cleanup handler for __check_pf to unlock the lock when cancelled by
> > > > another thread.  This fixes BZ #20975 and the siege hang issue.
> > > > ---
> > > >  sysdeps/unix/sysv/linux/Makefile   |  2 ++
> > > >  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
> > > >  2 files changed, 17 insertions(+)
> > > >
> > > > diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> > > > index aec7a94785..0160be8790 100644
> > > > --- a/sysdeps/unix/sysv/linux/Makefile
> > > > +++ b/sysdeps/unix/sysv/linux/Makefile
> > > > @@ -529,6 +529,8 @@ sysdep_headers += \
> > > >  sysdep_routines += \
> > > >    netlink_assert_response \
> > > >    # sysdep_routines
> > > > +
> > > > +CFLAGS-check_pf.c += -fexceptions
> > > >  endif
> > > >
> > > >  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> > > > diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> > > > index b157c5126c..2b0b8b6368 100644
> > > > --- a/sysdeps/unix/sysv/linux/check_pf.c
> > > > +++ b/sysdeps/unix/sysv/linux/check_pf.c
> > > > @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
> > > >    return NULL;
> > > >  }
> > > >
> > > > +#ifdef __EXCEPTIONS
> > > > +static void
> > > > +cancel_handler (void *arg __attribute__((unused)))
> > > > +{
> > > > +  /* Release the lock.  */
> > > > +  __libc_lock_unlock (lock);
> > > > +}
> > > > +#endif
> > > >
> > > >  void
> > > >  attribute_hidden
> > > > @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> > > >    struct cached_data *olddata = NULL;
> > > >    struct cached_data *data = NULL;
> > > >
> > > > +#ifdef __EXCEPTIONS
> > > > +  /* Make sure that lock is released when the thread is cancelled.  */
> > > > +  __libc_cleanup_push (cancel_handler, NULL);
> > > > +#endif
> > > >    __libc_lock_lock (lock);
> > > >
> > > >    if (cache_valid_p ())
> > > > @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> > > >         }
> > > >      }
> > > >
> > > > +#ifdef __EXCEPTIONS
> > > > +  __libc_cleanup_pop (0);
> > > > +#endif
> > > >    __libc_lock_unlock (lock);
> > > >
> > > >    if (data != NULL)
> > > > --
> > > > 2.40.0
> > > >
> > >
> > > Can we add a test the will just XFAIL (or XPASS) if cores < 64?
> >
> > There is no simple test for this.  This is one reason why it hasn't been fixed
> > since 2016.
> >
>
> Did you verify this fixes issue in siege?
> > --
> > H.J.

Yes.
Noah Goldstein April 27, 2023, 10:03 p.m. UTC | #5
On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> There are reports for hang in __check_pf:
>
> https://github.com/JoeDog/siege/issues/4
>
> It is reproducible only under specific configurations:
>
> 1. Large number of cores (>= 64) and large number of threads (> 3X of
> the number of cores) with long lived socket connection.
> 2. Low power (frequency) mode.
> 3. Power management is enabled.
>
> While holding lock, __check_pf calls make_request which calls __sendto
> and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> lock held by __check_pf won't be released and can cause deadlock when
> thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> cleanup handler for __check_pf to unlock the lock when cancelled by
> another thread.  This fixes BZ #20975 and the siege hang issue.
> ---
>  sysdeps/unix/sysv/linux/Makefile   |  2 ++
>  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
>  2 files changed, 17 insertions(+)
>
> diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> index aec7a94785..0160be8790 100644
> --- a/sysdeps/unix/sysv/linux/Makefile
> +++ b/sysdeps/unix/sysv/linux/Makefile
> @@ -529,6 +529,8 @@ sysdep_headers += \
>  sysdep_routines += \
>    netlink_assert_response \
>    # sysdep_routines
> +
> +CFLAGS-check_pf.c += -fexceptions
>  endif
>
>  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> index b157c5126c..2b0b8b6368 100644
> --- a/sysdeps/unix/sysv/linux/check_pf.c
> +++ b/sysdeps/unix/sysv/linux/check_pf.c
> @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
>    return NULL;
>  }
>
> +#ifdef __EXCEPTIONS
> +static void
> +cancel_handler (void *arg __attribute__((unused)))
> +{
> +  /* Release the lock.  */
> +  __libc_lock_unlock (lock);
> +}
> +#endif
>
>  void
>  attribute_hidden
> @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
>    struct cached_data *olddata = NULL;
>    struct cached_data *data = NULL;
>
> +#ifdef __EXCEPTIONS
> +  /* Make sure that lock is released when the thread is cancelled.  */
> +  __libc_cleanup_push (cancel_handler, NULL);
> +#endif

Should we wait until we have successfully taken the lock to push this?

>    __libc_lock_lock (lock);
>
>    if (cache_valid_p ())
> @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
>         }
>      }
>
> +#ifdef __EXCEPTIONS
> +  __libc_cleanup_pop (0);
> +#endif
>    __libc_lock_unlock (lock);
>
>    if (data != NULL)
> --
> 2.40.0
>
H.J. Lu April 27, 2023, 10:10 p.m. UTC | #6
On Thu, Apr 27, 2023 at 3:03 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > There are reports for hang in __check_pf:
> >
> > https://github.com/JoeDog/siege/issues/4
> >
> > It is reproducible only under specific configurations:
> >
> > 1. Large number of cores (>= 64) and large number of threads (> 3X of
> > the number of cores) with long lived socket connection.
> > 2. Low power (frequency) mode.
> > 3. Power management is enabled.
> >
> > While holding lock, __check_pf calls make_request which calls __sendto
> > and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> > lock held by __check_pf won't be released and can cause deadlock when
> > thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> > cleanup handler for __check_pf to unlock the lock when cancelled by
> > another thread.  This fixes BZ #20975 and the siege hang issue.
> > ---
> >  sysdeps/unix/sysv/linux/Makefile   |  2 ++
> >  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
> >  2 files changed, 17 insertions(+)
> >
> > diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> > index aec7a94785..0160be8790 100644
> > --- a/sysdeps/unix/sysv/linux/Makefile
> > +++ b/sysdeps/unix/sysv/linux/Makefile
> > @@ -529,6 +529,8 @@ sysdep_headers += \
> >  sysdep_routines += \
> >    netlink_assert_response \
> >    # sysdep_routines
> > +
> > +CFLAGS-check_pf.c += -fexceptions
> >  endif
> >
> >  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> > diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> > index b157c5126c..2b0b8b6368 100644
> > --- a/sysdeps/unix/sysv/linux/check_pf.c
> > +++ b/sysdeps/unix/sysv/linux/check_pf.c
> > @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
> >    return NULL;
> >  }
> >
> > +#ifdef __EXCEPTIONS
> > +static void
> > +cancel_handler (void *arg __attribute__((unused)))
> > +{
> > +  /* Release the lock.  */
> > +  __libc_lock_unlock (lock);
> > +}
> > +#endif
> >
> >  void
> >  attribute_hidden
> > @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> >    struct cached_data *olddata = NULL;
> >    struct cached_data *data = NULL;
> >
> > +#ifdef __EXCEPTIONS
> > +  /* Make sure that lock is released when the thread is cancelled.  */
> > +  __libc_cleanup_push (cancel_handler, NULL);
> > +#endif
>
> Should we wait until we have successfully taken the lock to push this?

The cleanup handler is called only when thread cancellation happens
in __sendto or __recvmsg called from make_request after lock has
been taken.   The order here isn't critical.

>
> >    __libc_lock_lock (lock);
> >
> >    if (cache_valid_p ())
> > @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> >         }
> >      }
> >
> > +#ifdef __EXCEPTIONS
> > +  __libc_cleanup_pop (0);
> > +#endif
> >    __libc_lock_unlock (lock);
> >
> >    if (data != NULL)
> > --
> > 2.40.0
> >
Florian Weimer April 28, 2023, 8:38 a.m. UTC | #7
* H. J. Lu via Libc-alpha:

> There are reports for hang in __check_pf:
>
> https://github.com/JoeDog/siege/issues/4
>
> It is reproducible only under specific configurations:
>
> 1. Large number of cores (>= 64) and large number of threads (> 3X of
> the number of cores) with long lived socket connection.
> 2. Low power (frequency) mode.
> 3. Power management is enabled.
>
> While holding lock, __check_pf calls make_request which calls __sendto
> and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> lock held by __check_pf won't be released and can cause deadlock when
> thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> cleanup handler for __check_pf to unlock the lock when cancelled by
> another thread.  This fixes BZ #20975 and the siege hang issue.

It's probably easier to reproduce if the system has many network
interfaces with lots of addresses.

Doesn't getaddrinfo leak all kind of resources when canceled?  That's
more difficult to fix, though.

Thanks,
Florian
H.J. Lu April 28, 2023, 3:33 p.m. UTC | #8
On Fri, Apr 28, 2023 at 1:38 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu via Libc-alpha:
>
> > There are reports for hang in __check_pf:
> >
> > https://github.com/JoeDog/siege/issues/4
> >
> > It is reproducible only under specific configurations:
> >
> > 1. Large number of cores (>= 64) and large number of threads (> 3X of
> > the number of cores) with long lived socket connection.
> > 2. Low power (frequency) mode.
> > 3. Power management is enabled.
> >
> > While holding lock, __check_pf calls make_request which calls __sendto
> > and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> > lock held by __check_pf won't be released and can cause deadlock when
> > thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> > cleanup handler for __check_pf to unlock the lock when cancelled by
> > another thread.  This fixes BZ #20975 and the siege hang issue.
>
> It's probably easier to reproduce if the system has many network
> interfaces with lots of addresses.
>
> Doesn't getaddrinfo leak all kind of resources when canceled?  That's
> more difficult to fix, though.

We will work on it if there are reports.
Noah Goldstein April 28, 2023, 7:17 p.m. UTC | #9
On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> There are reports for hang in __check_pf:
>
> https://github.com/JoeDog/siege/issues/4
>
> It is reproducible only under specific configurations:
>
> 1. Large number of cores (>= 64) and large number of threads (> 3X of
> the number of cores) with long lived socket connection.
> 2. Low power (frequency) mode.
> 3. Power management is enabled.
>
> While holding lock, __check_pf calls make_request which calls __sendto
> and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> lock held by __check_pf won't be released and can cause deadlock when
> thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> cleanup handler for __check_pf to unlock the lock when cancelled by
> another thread.  This fixes BZ #20975 and the siege hang issue.
> ---
>  sysdeps/unix/sysv/linux/Makefile   |  2 ++
>  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
>  2 files changed, 17 insertions(+)
>
> diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> index aec7a94785..0160be8790 100644
> --- a/sysdeps/unix/sysv/linux/Makefile
> +++ b/sysdeps/unix/sysv/linux/Makefile
> @@ -529,6 +529,8 @@ sysdep_headers += \
>  sysdep_routines += \
>    netlink_assert_response \
>    # sysdep_routines
> +
> +CFLAGS-check_pf.c += -fexceptions
>  endif
>
>  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> index b157c5126c..2b0b8b6368 100644
> --- a/sysdeps/unix/sysv/linux/check_pf.c
> +++ b/sysdeps/unix/sysv/linux/check_pf.c
> @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
>    return NULL;
>  }
>
> +#ifdef __EXCEPTIONS
> +static void
> +cancel_handler (void *arg __attribute__((unused)))
> +{
> +  /* Release the lock.  */
> +  __libc_lock_unlock (lock);
> +}
> +#endif
>
>  void
>  attribute_hidden
> @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
>    struct cached_data *olddata = NULL;
>    struct cached_data *data = NULL;
>
> +#ifdef __EXCEPTIONS
> +  /* Make sure that lock is released when the thread is cancelled.  */
> +  __libc_cleanup_push (cancel_handler, NULL);
> +#endif
>    __libc_lock_lock (lock);
>
>    if (cache_valid_p ())
> @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
>         }
>      }
>
> +#ifdef __EXCEPTIONS
> +  __libc_cleanup_pop (0);
> +#endif
>    __libc_lock_unlock (lock);
>
>    if (data != NULL)
> --
> 2.40.0
>

LGTM.
H.J. Lu May 22, 2023, 8:37 p.m. UTC | #10
On Fri, Apr 28, 2023 at 12:17 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 27, 2023 at 3:06 PM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > There are reports for hang in __check_pf:
> >
> > https://github.com/JoeDog/siege/issues/4
> >
> > It is reproducible only under specific configurations:
> >
> > 1. Large number of cores (>= 64) and large number of threads (> 3X of
> > the number of cores) with long lived socket connection.
> > 2. Low power (frequency) mode.
> > 3. Power management is enabled.
> >
> > While holding lock, __check_pf calls make_request which calls __sendto
> > and __recvmsg.  Since __sendto and __recvmsg are cancellation points,
> > lock held by __check_pf won't be released and can cause deadlock when
> > thread cancellation happens in __sendto or __recvmsg.  Add a cancellation
> > cleanup handler for __check_pf to unlock the lock when cancelled by
> > another thread.  This fixes BZ #20975 and the siege hang issue.
> > ---
> >  sysdeps/unix/sysv/linux/Makefile   |  2 ++
> >  sysdeps/unix/sysv/linux/check_pf.c | 15 +++++++++++++++
> >  2 files changed, 17 insertions(+)
> >
> > diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
> > index aec7a94785..0160be8790 100644
> > --- a/sysdeps/unix/sysv/linux/Makefile
> > +++ b/sysdeps/unix/sysv/linux/Makefile
> > @@ -529,6 +529,8 @@ sysdep_headers += \
> >  sysdep_routines += \
> >    netlink_assert_response \
> >    # sysdep_routines
> > +
> > +CFLAGS-check_pf.c += -fexceptions
> >  endif
> >
> >  # Don't compile the ctype glue code, since there is no old non-GNU C library.
> > diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
> > index b157c5126c..2b0b8b6368 100644
> > --- a/sysdeps/unix/sysv/linux/check_pf.c
> > +++ b/sysdeps/unix/sysv/linux/check_pf.c
> > @@ -292,6 +292,14 @@ make_request (int fd, pid_t pid)
> >    return NULL;
> >  }
> >
> > +#ifdef __EXCEPTIONS
> > +static void
> > +cancel_handler (void *arg __attribute__((unused)))
> > +{
> > +  /* Release the lock.  */
> > +  __libc_lock_unlock (lock);
> > +}
> > +#endif
> >
> >  void
> >  attribute_hidden
> > @@ -304,6 +312,10 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> >    struct cached_data *olddata = NULL;
> >    struct cached_data *data = NULL;
> >
> > +#ifdef __EXCEPTIONS
> > +  /* Make sure that lock is released when the thread is cancelled.  */
> > +  __libc_cleanup_push (cancel_handler, NULL);
> > +#endif
> >    __libc_lock_lock (lock);
> >
> >    if (cache_valid_p ())
> > @@ -338,6 +350,9 @@ __check_pf (bool *seen_ipv4, bool *seen_ipv6,
> >         }
> >      }
> >
> > +#ifdef __EXCEPTIONS
> > +  __libc_cleanup_pop (0);
> > +#endif
> >    __libc_lock_unlock (lock);
> >
> >    if (data != NULL)
> > --
> > 2.40.0
> >
>
> LGTM.

OK for release branches?

Thanks.
Florian Weimer May 23, 2023, 8:58 a.m. UTC | #11
* H. J. Lu via Libc-stable:

> OK for release branches?

It looks backportable to me.

Thanks,
Florian
Noah Goldstein May 23, 2023, 4:56 p.m. UTC | #12
On Tue, May 23, 2023 at 3:58 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu via Libc-stable:
>
> > OK for release branches?
>
> It looks backportable to me.
+1
>
> Thanks,
> Florian
>
diff mbox series

Patch

diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index aec7a94785..0160be8790 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -529,6 +529,8 @@  sysdep_headers += \
 sysdep_routines += \
   netlink_assert_response \
   # sysdep_routines
+
+CFLAGS-check_pf.c += -fexceptions
 endif
 
 # Don't compile the ctype glue code, since there is no old non-GNU C library.
diff --git a/sysdeps/unix/sysv/linux/check_pf.c b/sysdeps/unix/sysv/linux/check_pf.c
index b157c5126c..2b0b8b6368 100644
--- a/sysdeps/unix/sysv/linux/check_pf.c
+++ b/sysdeps/unix/sysv/linux/check_pf.c
@@ -292,6 +292,14 @@  make_request (int fd, pid_t pid)
   return NULL;
 }
 
+#ifdef __EXCEPTIONS
+static void
+cancel_handler (void *arg __attribute__((unused)))
+{
+  /* Release the lock.  */
+  __libc_lock_unlock (lock);
+}
+#endif
 
 void
 attribute_hidden
@@ -304,6 +312,10 @@  __check_pf (bool *seen_ipv4, bool *seen_ipv6,
   struct cached_data *olddata = NULL;
   struct cached_data *data = NULL;
 
+#ifdef __EXCEPTIONS
+  /* Make sure that lock is released when the thread is cancelled.  */
+  __libc_cleanup_push (cancel_handler, NULL);
+#endif
   __libc_lock_lock (lock);
 
   if (cache_valid_p ())
@@ -338,6 +350,9 @@  __check_pf (bool *seen_ipv4, bool *seen_ipv6,
 	}
     }
 
+#ifdef __EXCEPTIONS
+  __libc_cleanup_pop (0);
+#endif
   __libc_lock_unlock (lock);
 
   if (data != NULL)