diff mbox series

msort: Get rid of alloca.

Message ID 20230630172636.384922-1-josimmon@redhat.com
State New
Headers show
Series msort: Get rid of alloca. | expand

Commit Message

Joe Simmons-Talbott June 30, 2023, 5:26 p.m. UTC
Use a scratch_buffer rather than alloca/malloc to avoid potential stack
overflow.
---
 stdlib/msort.c | 94 +++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 51 deletions(-)

Comments

Paul Eggert June 30, 2023, 8:12 p.m. UTC | #1
On 2023-06-30 10:26, Joe Simmons-Talbott via Libc-alpha wrote:

> +  /* If the memory requirements are too high don't allocate memory.  */
> +  if (size / pagesize > (size_t) phys_pages)
> +    {
> +      _quicksort (b, n, s, cmp, arg);
> +      return;
> +    }
> ...
> +  if (!scratch_buffer_set_array_size (&buf, 1, size))
> +    {
> +      /* Couldn't get space, so use the slower algorithm
> +         that doesn't need a temporary array.  */
> +      _quicksort (b, n, s, cmp, arg);
> +      return;
>       }

Please combine the two ifs into one, since their then-parts are the same 
and that will make the code easier to follow.


> +  scratch_buffer_free (&buf);

Dumb question: can we arrange for scratch_buffer_free to be called even 
if qsort's comparison function longjmps out of qsort? I realize the old 
code leaked in that case, but it'd be nice if the new code didn't leak 
too. This sort of longjmp sometimes happens in real code, e.g., in GNU 'ls'.
Joe Simmons-Talbott July 3, 2023, 4:08 p.m. UTC | #2
On Fri, Jun 30, 2023 at 01:12:28PM -0700, Paul Eggert wrote:
> On 2023-06-30 10:26, Joe Simmons-Talbott via Libc-alpha wrote:
> 
> > +  /* If the memory requirements are too high don't allocate memory.  */
> > +  if (size / pagesize > (size_t) phys_pages)
> > +    {
> > +      _quicksort (b, n, s, cmp, arg);
> > +      return;
> > +    }
> > ...
> > +  if (!scratch_buffer_set_array_size (&buf, 1, size))
> > +    {
> > +      /* Couldn't get space, so use the slower algorithm
> > +         that doesn't need a temporary array.  */
> > +      _quicksort (b, n, s, cmp, arg);
> > +      return;
> >       }
> 
> Please combine the two ifs into one, since their then-parts are the same and
> that will make the code easier to follow.

I'll do this in v2.  Thanks for the suggestion and the review.

> 
> 
> > +  scratch_buffer_free (&buf);
> 
> Dumb question: can we arrange for scratch_buffer_free to be called even if
> qsort's comparison function longjmps out of qsort? I realize the old code
> leaked in that case, but it'd be nice if the new code didn't leak too. This
> sort of longjmp sometimes happens in real code, e.g., in GNU 'ls'.
> 

I'll have to defer to someone more knowledgable than me on this one.  My
understanding is that longjmp resets the stack pointer and doesn't call
any GCC cleanup handlers thus ruling out the one option I was able to
think of.  Does anyone else have thoughts on this?

Thanks,
Joe
Joe Simmons-Talbott July 3, 2023, 6:25 p.m. UTC | #3
On Fri, Jun 30, 2023 at 01:26:36PM -0400, Joe Simmons-Talbott wrote:
> Use a scratch_buffer rather than alloca/malloc to avoid potential stack
> overflow.

This patch failed the check on arm[1] but I'm unable to replicate that
locally with build-many-glibcs.py.  Does anyone have any suggestions for
replicating these testcase failures? 

Thanks,
Joe

[1] https://patchwork.sourceware.org/project/glibc/patch/20230630172636.384922-1-josimmon@redhat.com/

> ---
>  stdlib/msort.c | 94 +++++++++++++++++++++++---------------------------
>  1 file changed, 43 insertions(+), 51 deletions(-)
> 
> diff --git a/stdlib/msort.c b/stdlib/msort.c
> index bbaa5e9f82..237dbb444f 100644
> --- a/stdlib/msort.c
> +++ b/stdlib/msort.c
> @@ -24,6 +24,7 @@
>  #include <memcopy.h>
>  #include <errno.h>
>  #include <atomic.h>
> +#include <scratch_buffer.h>
>  
>  struct msort_param
>  {
> @@ -164,71 +165,62 @@ void
>  __qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
>  {
>    size_t size = n * s;
> -  char *tmp = NULL;
>    struct msort_param p;
> +  struct scratch_buffer buf;
> +  scratch_buffer_init (&buf);
>  
>    /* For large object sizes use indirect sorting.  */
>    if (s > 32)
>      size = 2 * n * sizeof (void *) + s;
>  
> -  if (size < 1024)
> -    /* The temporary array is small, so put it on the stack.  */
> -    p.t = __alloca (size);
> -  else
> -    {
> -      /* We should avoid allocating too much memory since this might
> -	 have to be backed up by swap space.  */
> -      static long int phys_pages;
> -      static int pagesize;
> +  /* We should avoid allocating too much memory since this might
> +     have to be backed up by swap space.  */
> +  static long int phys_pages;
> +  static int pagesize;
>  
> -      if (pagesize == 0)
> -	{
> -	  phys_pages = __sysconf (_SC_PHYS_PAGES);
> +  if (pagesize == 0)
> +    {
> +      phys_pages = __sysconf (_SC_PHYS_PAGES);
>  
> -	  if (phys_pages == -1)
> -	    /* Error while determining the memory size.  So let's
> -	       assume there is enough memory.  Otherwise the
> -	       implementer should provide a complete implementation of
> -	       the `sysconf' function.  */
> -	    phys_pages = (long int) (~0ul >> 1);
> +      if (phys_pages == -1)
> +        /* Error while determining the memory size.  So let's
> +           assume there is enough memory.  Otherwise the
> +           implementer should provide a complete implementation of
> +           the `sysconf' function.  */
> +        phys_pages = (long int) (~0ul >> 1);
>  
> -	  /* The following determines that we will never use more than
> -	     a quarter of the physical memory.  */
> -	  phys_pages /= 4;
> +      /* The following determines that we will never use more than
> +         a quarter of the physical memory.  */
> +      phys_pages /= 4;
>  
> -	  /* Make sure phys_pages is written to memory.  */
> -	  atomic_write_barrier ();
> +      /* Make sure phys_pages is written to memory.  */
> +      atomic_write_barrier ();
>  
> -	  pagesize = __sysconf (_SC_PAGESIZE);
> -	}
> +      pagesize = __sysconf (_SC_PAGESIZE);
> +    }
>  
> -      /* Just a comment here.  We cannot compute
> -	   phys_pages * pagesize
> -	   and compare the needed amount of memory against this value.
> -	   The problem is that some systems might have more physical
> -	   memory then can be represented with a `size_t' value (when
> -	   measured in bytes.  */
> +  /* Just a comment here.  We cannot compute
> +       phys_pages * pagesize
> +       and compare the needed amount of memory against this value.
> +       The problem is that some systems might have more physical
> +       memory then can be represented with a `size_t' value (when
> +       measured in bytes.  */
>  
> -      /* If the memory requirements are too high don't allocate memory.  */
> -      if (size / pagesize > (size_t) phys_pages)
> -	{
> -	  _quicksort (b, n, s, cmp, arg);
> -	  return;
> -	}
> +  /* If the memory requirements are too high don't allocate memory.  */
> +  if (size / pagesize > (size_t) phys_pages)
> +    {
> +      _quicksort (b, n, s, cmp, arg);
> +      return;
> +    }
>  
> -      /* It's somewhat large, so malloc it.  */
> -      int save = errno;
> -      tmp = malloc (size);
> -      __set_errno (save);
> -      if (tmp == NULL)
> -	{
> -	  /* Couldn't get space, so use the slower algorithm
> -	     that doesn't need a temporary array.  */
> -	  _quicksort (b, n, s, cmp, arg);
> -	  return;
> -	}
> -      p.t = tmp;
> +  if (!scratch_buffer_set_array_size (&buf, 1, size))
> +    {
> +      /* Couldn't get space, so use the slower algorithm
> +         that doesn't need a temporary array.  */
> +      _quicksort (b, n, s, cmp, arg);
> +      return;
>      }
> +  p.t = buf.data;
>  
>    p.s = s;
>    p.var = 4;
> @@ -295,7 +287,7 @@ __qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
>  	}
>        msort_with_tmp (&p, b, n);
>      }
> -  free (tmp);
> +  scratch_buffer_free (&buf);
>  }
>  libc_hidden_def (__qsort_r)
>  weak_alias (__qsort_r, qsort_r)
> -- 
> 2.39.2
>
Adhemerval Zanella Netto July 6, 2023, 1:43 p.m. UTC | #4
On 03/07/23 13:08, Joe Simmons-Talbott via Libc-alpha wrote:
> On Fri, Jun 30, 2023 at 01:12:28PM -0700, Paul Eggert wrote:
>> On 2023-06-30 10:26, Joe Simmons-Talbott via Libc-alpha wrote:
>>
>>> +  /* If the memory requirements are too high don't allocate memory.  */
>>> +  if (size / pagesize > (size_t) phys_pages)
>>> +    {
>>> +      _quicksort (b, n, s, cmp, arg);
>>> +      return;
>>> +    }
>>> ...
>>> +  if (!scratch_buffer_set_array_size (&buf, 1, size))
>>> +    {
>>> +      /* Couldn't get space, so use the slower algorithm
>>> +         that doesn't need a temporary array.  */
>>> +      _quicksort (b, n, s, cmp, arg);
>>> +      return;
>>>       }
>>
>> Please combine the two ifs into one, since their then-parts are the same and
>> that will make the code easier to follow.
> 
> I'll do this in v2.  Thanks for the suggestion and the review.
> 
>>
>>
>>> +  scratch_buffer_free (&buf);
>>
>> Dumb question: can we arrange for scratch_buffer_free to be called even if
>> qsort's comparison function longjmps out of qsort? I realize the old code
>> leaked in that case, but it'd be nice if the new code didn't leak too. This
>> sort of longjmp sometimes happens in real code, e.g., in GNU 'ls'.
>>
> 
> I'll have to defer to someone more knowledgable than me on this one.  My
> understanding is that longjmp resets the stack pointer and doesn't call
> any GCC cleanup handlers thus ruling out the one option I was able to
> think of.  Does anyone else have thoughts on this?

Another option would to just remove malloc from qsort usage, by using the
quicksort implementation instead.  To avoid the worse case we fallback to
a simple heapsort, as some of C++ implementation does.  This has the advantage
of fixing the possible longjmp issue and making the qsort full async-safe.
Joe Simmons-Talbott July 6, 2023, 7:17 p.m. UTC | #5
On Thu, Jul 06, 2023 at 10:43:42AM -0300, Adhemerval Zanella Netto wrote:
> 
> 
> On 03/07/23 13:08, Joe Simmons-Talbott via Libc-alpha wrote:
> > On Fri, Jun 30, 2023 at 01:12:28PM -0700, Paul Eggert wrote:
> >> On 2023-06-30 10:26, Joe Simmons-Talbott via Libc-alpha wrote:
> >>
> >>> +  /* If the memory requirements are too high don't allocate memory.  */
> >>> +  if (size / pagesize > (size_t) phys_pages)
> >>> +    {
> >>> +      _quicksort (b, n, s, cmp, arg);
> >>> +      return;
> >>> +    }
> >>> ...
> >>> +  if (!scratch_buffer_set_array_size (&buf, 1, size))
> >>> +    {
> >>> +      /* Couldn't get space, so use the slower algorithm
> >>> +         that doesn't need a temporary array.  */
> >>> +      _quicksort (b, n, s, cmp, arg);
> >>> +      return;
> >>>       }
> >>
> >> Please combine the two ifs into one, since their then-parts are the same and
> >> that will make the code easier to follow.
> > 
> > I'll do this in v2.  Thanks for the suggestion and the review.
> > 
> >>
> >>
> >>> +  scratch_buffer_free (&buf);
> >>
> >> Dumb question: can we arrange for scratch_buffer_free to be called even if
> >> qsort's comparison function longjmps out of qsort? I realize the old code
> >> leaked in that case, but it'd be nice if the new code didn't leak too. This
> >> sort of longjmp sometimes happens in real code, e.g., in GNU 'ls'.
> >>
> > 
> > I'll have to defer to someone more knowledgable than me on this one.  My
> > understanding is that longjmp resets the stack pointer and doesn't call
> > any GCC cleanup handlers thus ruling out the one option I was able to
> > think of.  Does anyone else have thoughts on this?
> 
> Another option would to just remove malloc from qsort usage, by using the
> quicksort implementation instead.  To avoid the worse case we fallback to
> a simple heapsort, as some of C++ implementation does.  This has the advantage
> of fixing the possible longjmp issue and making the qsort full async-safe.
> 

From my understanding there are three cases in __qsort_r; small where we
use alloca, medium where we use malloc, and large where we use
_quicksort.  This would lead to always using _quicksort, if I understand
correctly.  _quicksort reduces the probability of the worst case by
choosing the median as the pivot.  Am I missing something?

Thanks,
Joe
Xi Ruoyao July 6, 2023, 7:29 p.m. UTC | #6
On Thu, 2023-07-06 at 15:17 -0400, Joe Simmons-Talbott via Libc-alpha wrote:
> On Thu, Jul 06, 2023 at 10:43:42AM -0300, Adhemerval Zanella Netto wrote:
> > 
> > 
> > On 03/07/23 13:08, Joe Simmons-Talbott via Libc-alpha wrote:
> > > On Fri, Jun 30, 2023 at 01:12:28PM -0700, Paul Eggert wrote:
> > > > On 2023-06-30 10:26, Joe Simmons-Talbott via Libc-alpha wrote:
> > > > 
> > > > > +  /* If the memory requirements are too high don't allocate memory.  */
> > > > > +  if (size / pagesize > (size_t) phys_pages)
> > > > > +    {
> > > > > +      _quicksort (b, n, s, cmp, arg);
> > > > > +      return;
> > > > > +    }
> > > > > ...
> > > > > +  if (!scratch_buffer_set_array_size (&buf, 1, size))
> > > > > +    {
> > > > > +      /* Couldn't get space, so use the slower algorithm
> > > > > +         that doesn't need a temporary array.  */
> > > > > +      _quicksort (b, n, s, cmp, arg);
> > > > > +      return;
> > > > >       }
> > > > 
> > > > Please combine the two ifs into one, since their then-parts are the same and
> > > > that will make the code easier to follow.
> > > 
> > > I'll do this in v2.  Thanks for the suggestion and the review.
> > > 
> > > > 
> > > > 
> > > > > +  scratch_buffer_free (&buf);
> > > > 
> > > > Dumb question: can we arrange for scratch_buffer_free to be called even if
> > > > qsort's comparison function longjmps out of qsort? I realize the old code
> > > > leaked in that case, but it'd be nice if the new code didn't leak too. This
> > > > sort of longjmp sometimes happens in real code, e.g., in GNU 'ls'.
> > > > 
> > > 
> > > I'll have to defer to someone more knowledgable than me on this one.  My
> > > understanding is that longjmp resets the stack pointer and doesn't call
> > > any GCC cleanup handlers thus ruling out the one option I was able to
> > > think of.  Does anyone else have thoughts on this?
> > 
> > Another option would to just remove malloc from qsort usage, by using the
> > quicksort implementation instead.  To avoid the worse case we fallback to
> > a simple heapsort, as some of C++ implementation does.  This has the advantage
> > of fixing the possible longjmp issue and making the qsort full async-safe.
> > 
> 
> From my understanding there are three cases in __qsort_r; small where we
> use alloca, medium where we use malloc, and large where we use
> _quicksort.  This would lead to always using _quicksort, if I understand
> correctly.  _quicksort reduces the probability of the worst case by
> choosing the median as the pivot.  Am I missing something?

"To avoid the worse case we fallback to a simple heapsort".  This is not
implemented in _quicksort yet.

See https://en.wikipedia.org/wiki/Introsort.
Adhemerval Zanella Netto July 7, 2023, 1:56 p.m. UTC | #7
On 06/07/23 16:29, Xi Ruoyao wrote:
> On Thu, 2023-07-06 at 15:17 -0400, Joe Simmons-Talbott via Libc-alpha wrote:
>> On Thu, Jul 06, 2023 at 10:43:42AM -0300, Adhemerval Zanella Netto wrote:
>>>
>>>
>>> On 03/07/23 13:08, Joe Simmons-Talbott via Libc-alpha wrote:
>>>> On Fri, Jun 30, 2023 at 01:12:28PM -0700, Paul Eggert wrote:
>>>>> On 2023-06-30 10:26, Joe Simmons-Talbott via Libc-alpha wrote:
>>>>>
>>>>>> +  /* If the memory requirements are too high don't allocate memory.  */
>>>>>> +  if (size / pagesize > (size_t) phys_pages)
>>>>>> +    {
>>>>>> +      _quicksort (b, n, s, cmp, arg);
>>>>>> +      return;
>>>>>> +    }
>>>>>> ...
>>>>>> +  if (!scratch_buffer_set_array_size (&buf, 1, size))
>>>>>> +    {
>>>>>> +      /* Couldn't get space, so use the slower algorithm
>>>>>> +         that doesn't need a temporary array.  */
>>>>>> +      _quicksort (b, n, s, cmp, arg);
>>>>>> +      return;
>>>>>>       }
>>>>>
>>>>> Please combine the two ifs into one, since their then-parts are the same and
>>>>> that will make the code easier to follow.
>>>>
>>>> I'll do this in v2.  Thanks for the suggestion and the review.
>>>>
>>>>>
>>>>>
>>>>>> +  scratch_buffer_free (&buf);
>>>>>
>>>>> Dumb question: can we arrange for scratch_buffer_free to be called even if
>>>>> qsort's comparison function longjmps out of qsort? I realize the old code
>>>>> leaked in that case, but it'd be nice if the new code didn't leak too. This
>>>>> sort of longjmp sometimes happens in real code, e.g., in GNU 'ls'.
>>>>>
>>>>
>>>> I'll have to defer to someone more knowledgable than me on this one.  My
>>>> understanding is that longjmp resets the stack pointer and doesn't call
>>>> any GCC cleanup handlers thus ruling out the one option I was able to
>>>> think of.  Does anyone else have thoughts on this?
>>>
>>> Another option would to just remove malloc from qsort usage, by using the
>>> quicksort implementation instead.  To avoid the worse case we fallback to
>>> a simple heapsort, as some of C++ implementation does.  This has the advantage
>>> of fixing the possible longjmp issue and making the qsort full async-safe.
>>>
>>
>> From my understanding there are three cases in __qsort_r; small where we
>> use alloca, medium where we use malloc, and large where we use
>> _quicksort.  This would lead to always using _quicksort, if I understand
>> correctly.  _quicksort reduces the probability of the worst case by
>> choosing the median as the pivot.  Am I missing something?
> 
> "To avoid the worse case we fallback to a simple heapsort".  This is not
> implemented in _quicksort yet.
> 
> See https://en.wikipedia.org/wiki/Introsort.

The issue is mergesort is a stable sort and usually much faster than
quicksort, so changing the implementation to introsort will have performance
impacts.  That's the main issue Paul has brought in a previous attempt [1],
as he suggested that instead we provide a better API instead.  I still think
this is an orthogonal issue, we should instead strive on glibc to provide
a simpler implementation that works on most scenarios without adding significant
drawnbacks (that's why I suggested a non quadratic implementation without
the need to allocate extra memory).

And it seems that FreeBSD developers is also moving toward this direction.
They tried to improve bsort a bitonic type sorting, but revert it instead [2]
with an important remark:

  libc is not the right place for sorting algorithms

So I still think that we should provide better sorting algorithms through
a different project (maybe gnulib itself).

[1] https://sourceware.org/pipermail/libc-alpha/2021-September/130698.html
[2] https://github.com/freebsd/freebsd-src/commit/bb8e8e230d94c9522bd9ff95c13dc9f5b1592929
diff mbox series

Patch

diff --git a/stdlib/msort.c b/stdlib/msort.c
index bbaa5e9f82..237dbb444f 100644
--- a/stdlib/msort.c
+++ b/stdlib/msort.c
@@ -24,6 +24,7 @@ 
 #include <memcopy.h>
 #include <errno.h>
 #include <atomic.h>
+#include <scratch_buffer.h>
 
 struct msort_param
 {
@@ -164,71 +165,62 @@  void
 __qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
 {
   size_t size = n * s;
-  char *tmp = NULL;
   struct msort_param p;
+  struct scratch_buffer buf;
+  scratch_buffer_init (&buf);
 
   /* For large object sizes use indirect sorting.  */
   if (s > 32)
     size = 2 * n * sizeof (void *) + s;
 
-  if (size < 1024)
-    /* The temporary array is small, so put it on the stack.  */
-    p.t = __alloca (size);
-  else
-    {
-      /* We should avoid allocating too much memory since this might
-	 have to be backed up by swap space.  */
-      static long int phys_pages;
-      static int pagesize;
+  /* We should avoid allocating too much memory since this might
+     have to be backed up by swap space.  */
+  static long int phys_pages;
+  static int pagesize;
 
-      if (pagesize == 0)
-	{
-	  phys_pages = __sysconf (_SC_PHYS_PAGES);
+  if (pagesize == 0)
+    {
+      phys_pages = __sysconf (_SC_PHYS_PAGES);
 
-	  if (phys_pages == -1)
-	    /* Error while determining the memory size.  So let's
-	       assume there is enough memory.  Otherwise the
-	       implementer should provide a complete implementation of
-	       the `sysconf' function.  */
-	    phys_pages = (long int) (~0ul >> 1);
+      if (phys_pages == -1)
+        /* Error while determining the memory size.  So let's
+           assume there is enough memory.  Otherwise the
+           implementer should provide a complete implementation of
+           the `sysconf' function.  */
+        phys_pages = (long int) (~0ul >> 1);
 
-	  /* The following determines that we will never use more than
-	     a quarter of the physical memory.  */
-	  phys_pages /= 4;
+      /* The following determines that we will never use more than
+         a quarter of the physical memory.  */
+      phys_pages /= 4;
 
-	  /* Make sure phys_pages is written to memory.  */
-	  atomic_write_barrier ();
+      /* Make sure phys_pages is written to memory.  */
+      atomic_write_barrier ();
 
-	  pagesize = __sysconf (_SC_PAGESIZE);
-	}
+      pagesize = __sysconf (_SC_PAGESIZE);
+    }
 
-      /* Just a comment here.  We cannot compute
-	   phys_pages * pagesize
-	   and compare the needed amount of memory against this value.
-	   The problem is that some systems might have more physical
-	   memory then can be represented with a `size_t' value (when
-	   measured in bytes.  */
+  /* Just a comment here.  We cannot compute
+       phys_pages * pagesize
+       and compare the needed amount of memory against this value.
+       The problem is that some systems might have more physical
+       memory then can be represented with a `size_t' value (when
+       measured in bytes.  */
 
-      /* If the memory requirements are too high don't allocate memory.  */
-      if (size / pagesize > (size_t) phys_pages)
-	{
-	  _quicksort (b, n, s, cmp, arg);
-	  return;
-	}
+  /* If the memory requirements are too high don't allocate memory.  */
+  if (size / pagesize > (size_t) phys_pages)
+    {
+      _quicksort (b, n, s, cmp, arg);
+      return;
+    }
 
-      /* It's somewhat large, so malloc it.  */
-      int save = errno;
-      tmp = malloc (size);
-      __set_errno (save);
-      if (tmp == NULL)
-	{
-	  /* Couldn't get space, so use the slower algorithm
-	     that doesn't need a temporary array.  */
-	  _quicksort (b, n, s, cmp, arg);
-	  return;
-	}
-      p.t = tmp;
+  if (!scratch_buffer_set_array_size (&buf, 1, size))
+    {
+      /* Couldn't get space, so use the slower algorithm
+         that doesn't need a temporary array.  */
+      _quicksort (b, n, s, cmp, arg);
+      return;
     }
+  p.t = buf.data;
 
   p.s = s;
   p.var = 4;
@@ -295,7 +287,7 @@  __qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
 	}
       msort_with_tmp (&p, b, n);
     }
-  free (tmp);
+  scratch_buffer_free (&buf);
 }
 libc_hidden_def (__qsort_r)
 weak_alias (__qsort_r, qsort_r)