Message ID | 20230711190722.4028821-2-adhemerval.zanella@linaro.org |
---|---|
State | New |
Headers | show |
Series | Use introsort for qsort | expand |
On Tue, Jul 11, 2023 at 2:08 PM Adhemerval Zanella via Libc-alpha <libc-alpha@sourceware.org> wrote: > > It optimizes take in consideration both the most common elements are > either 32 or 64 bit in size and inputs are aligned to the word > boundary. This is similar to the optimization done on lib/sort.c > from Linux. > > This patchs adds an optimized swap operation on qsort based in previous > msort one. Instead of byte operation, three variants are provided: > > 1. Using uint32_t loads and stores. > 2. Using uint64_t loads and stores. > 3. Generic one with a temporary buffer and memcpy/mempcpy. > > The 1. and 2. options are selected only if base pointer is aligned > to required type. > > It also fixes BZ#19305 by checking input size against number of > elements 1 besides 0. > > Checked on x86_64-linux-gnu. > --- > stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 95 insertions(+), 18 deletions(-) > > diff --git a/stdlib/qsort.c b/stdlib/qsort.c > index 728a0ed370..8a3331fdb4 100644 > --- a/stdlib/qsort.c > +++ b/stdlib/qsort.c > @@ -23,20 +23,89 @@ > #include <limits.h> > #include <stdlib.h> > #include <string.h> > +#include <stdbool.h> > +#include <libc-pointer-arith.h> > > -/* Byte-wise swap two items of size SIZE. */ > -#define SWAP(a, b, size) \ > - do \ > - { \ > - size_t __size = (size); \ > - char *__a = (a), *__b = (b); \ > - do \ > - { \ > - char __tmp = *__a; \ > - *__a++ = *__b; \ > - *__b++ = __tmp; \ > - } while (--__size > 0); \ > - } while (0) > +/* Swap SIZE bytes between addresses A and B. These helpers are provided > + along the generic one as an optimization. */ > + > +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t); > + > +#define SWAP_WORDS_64 (swap_func_t)0 > +#define SWAP_WORDS_32 (swap_func_t)1 > +#define SWAP_BYTES (swap_func_t)2 > + Why both making this a function pointer? Why not just make it an enum? > +/* Returns true if elements can be copied using word loads and stores. 
> + The SIZE and BASE must be a multiple of the ALIGN. */ > +__attribute_const__ __always_inline > +static bool > +is_aligned (const void *base, size_t size, unsigned char align) > +{ > + unsigned char lsbits = (unsigned char) size; > + > + lsbits |= (unsigned char)(uintptr_t) base; > + return (lsbits & (align - 1)) == 0; > +} > + > +static void > +swap_words_64 (void * restrict a, void * restrict b, size_t n) > +{ > + do > + { > + n -= 8; > + uint64_t t = *(uint64_t *)(a + n); > + *(uint64_t *)(a + n) = *(uint64_t *)(b + n); > + *(uint64_t *)(b + n) = t; > + } while (n); > +} > + > +static void > +swap_words_32 (void * restrict a, void * restrict b, size_t n) > +{ > + do > + { > + n -= 4; > + uint32_t t = *(uint32_t *)(a + n); > + *(uint32_t *)(a + n) = *(uint32_t *)(b + n); > + *(uint32_t *)(b + n) = t; > + } while (n); > +} > + > +static void > +swap_bytes (void * restrict a, void * restrict b, size_t n) > +{ > + /* Use multiple small memcpys with constant size to enable inlining > + on most targets. */ > + enum { SWAP_GENERIC_SIZE = 32 }; > + unsigned char tmp[SWAP_GENERIC_SIZE]; > + while (n > SWAP_GENERIC_SIZE) > + { > + __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE); > + a = __mempcpy (a, b, SWAP_GENERIC_SIZE); > + b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE); > + n -= SWAP_GENERIC_SIZE; > + } > + while (n > 0) > + { > + unsigned char t = ((unsigned char *)a)[--n]; > + ((unsigned char *)a)[n] = ((unsigned char *)b)[n]; > + ((unsigned char *)b)[n] = t; > + } > +} > + > +/* Replace the indirect call with a serie of if statements. It should help > + the branch predictor. */ > +static void > +do_swap (void * restrict a, void * restrict b, size_t size, > + swap_func_t swap_func) > +{ > + if (swap_func == SWAP_WORDS_64) > + swap_words_64 (a, b, size); > + else if (swap_func == SWAP_WORDS_32) > + swap_words_32 (a, b, size); > + else > + swap_bytes (a, b, size); > +} > > /* Discontinue quicksort algorithm when partition gets below this size. 
> This particular magic number was chosen to work best on a Sun 4/260. */ > @@ -96,6 +165,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > /* Avoid lossage with unsigned arithmetic below. */ > return; > > + swap_func_t swap_func; > + if (is_aligned (pbase, size, sizeof (uint64_t))) > + swap_func = SWAP_WORDS_64; > + else if (is_aligned (pbase, size, sizeof (uint32_t))) > + swap_func = SWAP_WORDS_32; > + else > + swap_func = SWAP_BYTES; > + > if (total_elems > MAX_THRESH) > { > char *lo = base_ptr; > @@ -119,13 +196,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > char *mid = lo + size * ((hi - lo) / size >> 1); > > if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) > - SWAP (mid, lo, size); > + do_swap (mid, lo, size, swap_func); > if ((*cmp) ((void *) hi, (void *) mid, arg) < 0) > - SWAP (mid, hi, size); > + do_swap (mid, hi, size, swap_func); > else > goto jump_over; > if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) > - SWAP (mid, lo, size); > + do_swap (mid, lo, size, swap_func); > jump_over:; > > left_ptr = lo + size; > @@ -144,7 +221,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > > if (left_ptr < right_ptr) > { > - SWAP (left_ptr, right_ptr, size); > + do_swap (left_ptr, right_ptr, size, swap_func); > if (mid == left_ptr) > mid = right_ptr; > else if (mid == right_ptr) > @@ -216,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, > tmp_ptr = run_ptr; > > if (tmp_ptr != base_ptr) > - SWAP (tmp_ptr, base_ptr, size); > + do_swap (tmp_ptr, base_ptr, size, swap_func); > > /* Insertion sort, running from left-hand-side up to right-hand-side. */ > > -- > 2.34.1 >
On 11/07/23 20:40, Noah Goldstein wrote: > On Tue, Jul 11, 2023 at 2:08 PM Adhemerval Zanella via Libc-alpha > <libc-alpha@sourceware.org> wrote: >> >> It optimizes take in consideration both the most common elements are >> either 32 or 64 bit in size and inputs are aligned to the word >> boundary. This is similar to the optimization done on lib/sort.c >> from Linux. >> >> This patchs adds an optimized swap operation on qsort based in previous >> msort one. Instead of byte operation, three variants are provided: >> >> 1. Using uint32_t loads and stores. >> 2. Using uint64_t loads and stores. >> 3. Generic one with a temporary buffer and memcpy/mempcpy. >> >> The 1. and 2. options are selected only if base pointer is aligned >> to required type. >> >> It also fixes BZ#19305 by checking input size against number of >> elements 1 besides 0. >> >> Checked on x86_64-linux-gnu. >> --- >> stdlib/qsort.c | 113 +++++++++++++++++++++++++++++++++++++++++-------- >> 1 file changed, 95 insertions(+), 18 deletions(-) >> >> diff --git a/stdlib/qsort.c b/stdlib/qsort.c >> index 728a0ed370..8a3331fdb4 100644 >> --- a/stdlib/qsort.c >> +++ b/stdlib/qsort.c >> @@ -23,20 +23,89 @@ >> #include <limits.h> >> #include <stdlib.h> >> #include <string.h> >> +#include <stdbool.h> >> +#include <libc-pointer-arith.h> >> >> -/* Byte-wise swap two items of size SIZE. */ >> -#define SWAP(a, b, size) \ >> - do \ >> - { \ >> - size_t __size = (size); \ >> - char *__a = (a), *__b = (b); \ >> - do \ >> - { \ >> - char __tmp = *__a; \ >> - *__a++ = *__b; \ >> - *__b++ = __tmp; \ >> - } while (--__size > 0); \ >> - } while (0) >> +/* Swap SIZE bytes between addresses A and B. These helpers are provided >> + along the generic one as an optimization. */ >> + >> +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t); >> + >> +#define SWAP_WORDS_64 (swap_func_t)0 >> +#define SWAP_WORDS_32 (swap_func_t)1 >> +#define SWAP_BYTES (swap_func_t)2 >> + > > Why both making this a function pointer? 
Why not just make it an enum? I used kernel lib/sort.c as a base, and it supports the caller passing a function as a wrapper; but since that does not make sense for this patch, an enum makes more sense. I will adjust to use it.
On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote: > +/* Returns true if elements can be copied using word loads and stores. > + The SIZE and BASE must be a multiple of the ALIGN. */ > +__attribute_const__ __always_inline > +static bool > +is_aligned (const void *base, size_t size, unsigned char align) > +{ > + unsigned char lsbits = (unsigned char) size; > + > + lsbits |= (unsigned char)(uintptr_t) base; > + return (lsbits & (align - 1)) == 0; > +} Doesn't the following source code generate better machine code? It does for me, anyway (x86-64). Even if the code were the same, the simplicity would be a win. static bool is_aligned (const void *base, size_t size, size_t align) { return (((uintptr_t) base | size) & (align - 1)) == 0; } > + if (is_aligned (pbase, size, sizeof (uint64_t))) > + swap_func = SWAP_WORDS_64; alignof not sizeof, in contexts like these that are talking about alignment not size.
On 12/07/23 18:04, Paul Eggert wrote: > On 2023-07-11 12:07, Adhemerval Zanella via Libc-alpha wrote: >> +/* Returns true if elements can be copied using word loads and stores. >> + The SIZE and BASE must be a multiple of the ALIGN. */ >> +__attribute_const__ __always_inline >> +static bool >> +is_aligned (const void *base, size_t size, unsigned char align) >> +{ >> + unsigned char lsbits = (unsigned char) size; >> + >> + lsbits |= (unsigned char)(uintptr_t) base; >> + return (lsbits & (align - 1)) == 0; >> +} > > Doesn't the following source code generate better machine code? It does for me, anyway (x86-64). Even if the code were the same, the simplicity would be a win. > > static bool > is_aligned (const void *base, size_t size, size_t align) > { > return (((uintptr_t) base | size) & (align - 1)) == 0; > } > Ok. > >> + if (is_aligned (pbase, size, sizeof (uint64_t))) >> + swap_func = SWAP_WORDS_64; > > alignof not sizeof, in contexts like these that are talking about alignment not size. Indeed, I will fix it.
On Thu, 13 Jul 2023, Adhemerval Zanella Netto via Libc-alpha wrote: > >> + if (is_aligned (pbase, size, sizeof (uint64_t))) > >> + swap_func = SWAP_WORDS_64; > > > > alignof not sizeof, in contexts like these that are talking about alignment not size. > > Indeed, I will fix it. Are you going to use GNU __alignof, or C11 _Alignof? One is safe. The other makes the code go *boom* on 32-bit x86 when 'size' is 4 modulo 8, coming this || close to a sneaky singular out-of-bounds write, saved only by the fact that gcc doesn't (yet) do high-level transforms in swap_words_64. (the code is not, in fact, talking solely about alignment there) Alexander
On 13/07/23 10:13, Alexander Monakov wrote: > > On Thu, 13 Jul 2023, Adhemerval Zanella Netto via Libc-alpha wrote: > >>>> + if (is_aligned (pbase, size, sizeof (uint64_t))) >>>> + swap_func = SWAP_WORDS_64; >>> >>> alignof not sizeof, in contexts like these that are talking about alignment not size. >> >> Indeed, I will fix it. > > Are you going to use GNU __alignof, or C11 _Alignof? One is safe. The other > makes the code go *boom* on 32-bit x86 when 'size' is 4 modulo 8, coming > this || close to a sneaky singular out-of-bounds write, saved only by the > fact that gcc doesn't (yet) do high-level transforms in swap_words_64. > > (the code is not, in fact, talking solely about alignment there) Yes, I noted it fails on i686; I will use an explicit value instead (as Linux lib/sort.c does).
diff --git a/stdlib/qsort.c b/stdlib/qsort.c index 728a0ed370..8a3331fdb4 100644 --- a/stdlib/qsort.c +++ b/stdlib/qsort.c @@ -23,20 +23,89 @@ #include <limits.h> #include <stdlib.h> #include <string.h> +#include <stdbool.h> +#include <libc-pointer-arith.h> -/* Byte-wise swap two items of size SIZE. */ -#define SWAP(a, b, size) \ - do \ - { \ - size_t __size = (size); \ - char *__a = (a), *__b = (b); \ - do \ - { \ - char __tmp = *__a; \ - *__a++ = *__b; \ - *__b++ = __tmp; \ - } while (--__size > 0); \ - } while (0) +/* Swap SIZE bytes between addresses A and B. These helpers are provided + along the generic one as an optimization. */ + +typedef void (*swap_func_t)(void * restrict, void * restrict, size_t); + +#define SWAP_WORDS_64 (swap_func_t)0 +#define SWAP_WORDS_32 (swap_func_t)1 +#define SWAP_BYTES (swap_func_t)2 + +/* Returns true if elements can be copied using word loads and stores. + The SIZE and BASE must be a multiple of the ALIGN. */ +__attribute_const__ __always_inline +static bool +is_aligned (const void *base, size_t size, unsigned char align) +{ + unsigned char lsbits = (unsigned char) size; + + lsbits |= (unsigned char)(uintptr_t) base; + return (lsbits & (align - 1)) == 0; +} + +static void +swap_words_64 (void * restrict a, void * restrict b, size_t n) +{ + do + { + n -= 8; + uint64_t t = *(uint64_t *)(a + n); + *(uint64_t *)(a + n) = *(uint64_t *)(b + n); + *(uint64_t *)(b + n) = t; + } while (n); +} + +static void +swap_words_32 (void * restrict a, void * restrict b, size_t n) +{ + do + { + n -= 4; + uint32_t t = *(uint32_t *)(a + n); + *(uint32_t *)(a + n) = *(uint32_t *)(b + n); + *(uint32_t *)(b + n) = t; + } while (n); +} + +static void +swap_bytes (void * restrict a, void * restrict b, size_t n) +{ + /* Use multiple small memcpys with constant size to enable inlining + on most targets. 
*/ + enum { SWAP_GENERIC_SIZE = 32 }; + unsigned char tmp[SWAP_GENERIC_SIZE]; + while (n > SWAP_GENERIC_SIZE) + { + __builtin_memcpy (tmp, a, SWAP_GENERIC_SIZE); + a = __mempcpy (a, b, SWAP_GENERIC_SIZE); + b = __mempcpy (b, tmp, SWAP_GENERIC_SIZE); + n -= SWAP_GENERIC_SIZE; + } + while (n > 0) + { + unsigned char t = ((unsigned char *)a)[--n]; + ((unsigned char *)a)[n] = ((unsigned char *)b)[n]; + ((unsigned char *)b)[n] = t; + } +} + +/* Replace the indirect call with a serie of if statements. It should help + the branch predictor. */ +static void +do_swap (void * restrict a, void * restrict b, size_t size, + swap_func_t swap_func) +{ + if (swap_func == SWAP_WORDS_64) + swap_words_64 (a, b, size); + else if (swap_func == SWAP_WORDS_32) + swap_words_32 (a, b, size); + else + swap_bytes (a, b, size); +} /* Discontinue quicksort algorithm when partition gets below this size. This particular magic number was chosen to work best on a Sun 4/260. */ @@ -96,6 +165,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, /* Avoid lossage with unsigned arithmetic below. 
*/ return; + swap_func_t swap_func; + if (is_aligned (pbase, size, sizeof (uint64_t))) + swap_func = SWAP_WORDS_64; + else if (is_aligned (pbase, size, sizeof (uint32_t))) + swap_func = SWAP_WORDS_32; + else + swap_func = SWAP_BYTES; + if (total_elems > MAX_THRESH) { char *lo = base_ptr; @@ -119,13 +196,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, char *mid = lo + size * ((hi - lo) / size >> 1); if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) - SWAP (mid, lo, size); + do_swap (mid, lo, size, swap_func); if ((*cmp) ((void *) hi, (void *) mid, arg) < 0) - SWAP (mid, hi, size); + do_swap (mid, hi, size, swap_func); else goto jump_over; if ((*cmp) ((void *) mid, (void *) lo, arg) < 0) - SWAP (mid, lo, size); + do_swap (mid, lo, size, swap_func); jump_over:; left_ptr = lo + size; @@ -144,7 +221,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, if (left_ptr < right_ptr) { - SWAP (left_ptr, right_ptr, size); + do_swap (left_ptr, right_ptr, size, swap_func); if (mid == left_ptr) mid = right_ptr; else if (mid == right_ptr) @@ -216,7 +293,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size, tmp_ptr = run_ptr; if (tmp_ptr != base_ptr) - SWAP (tmp_ptr, base_ptr, size); + do_swap (tmp_ptr, base_ptr, size, swap_func); /* Insertion sort, running from left-hand-side up to right-hand-side. */