Message ID | 20210903171144.952737-4-adhemerval.zanella@linaro.org |
---|---|
State | New |
Series | Use introsort for qsort |
On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
> The optimization takes into consideration both that the most common
> element sizes are either 32 or 64 bits [1] and that inputs are aligned
> on a word boundary.  This is similar to the optimization done in
> lib/sort.c from Linux.
>
> This patch adds an optimized swap operation to qsort based on the
> previous msort one.  Instead of byte operations, three variants are
> provided:
>
> 1. Using uint32_t loads and stores.
> 2. Using uint64_t loads and stores.
> 3. A generic one with a temporary buffer and memcpy/mempcpy.
>
> Options 1. and 2. are selected only if the architecture defines
> _STRING_ARCH_unaligned or if the base pointer is aligned to the
> required type.
>
> It also fixes BZ#19305 by checking the number of elements against 1
> in addition to 0.
>
> Checked on x86_64-linux-gnu.
>
> [1] https://sourceware.org/pipermail/libc-alpha/2018-August/096984.html
> ---
>  stdlib/qsort.c | 109 +++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 91 insertions(+), 18 deletions(-)
>
> [...]
>
> +/* Replace the indirect call with a series of if statements.  It should
> +   help the branch predictor.  */

1) Really? On Intel at least an indirect call that is always going to the
same place is certainly going to be predicted as well if not better than
2/3 branches + direct call.

2) If you're going to just test which swap function to use, why bother
initializing swap_func? Why not just use an int?

> +static void
> +do_swap (void * restrict a, void * restrict b, size_t size,
> +         swap_func_t swap_func)
> +{
> +  if (swap_func == SWAP_WORDS_64)
> +    swap_words_64 (a, b, size);
> +  else if (swap_func == SWAP_WORDS_32)
> +    swap_words_32 (a, b, size);
> +  else
> +    swap_bytes (a, b, size);
> +}
>
> [...]
>
> --
> 2.30.2
On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>> [...]
>>
>> +static void
>> +swap_words_64 (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  do
>> +    {
>> +      n -= 8;
>> +      uint64_t t = *(uint64_t *)(a + n);
>> +      *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
>> +      *(uint64_t *)(b + n) = t;
>> +    } while (n);
>> +}
>> +
>> +static void
>> +swap_words_32 (void * restrict a, void * restrict b, size_t n)
>> +{
>> +  do
>> +    {
>> +      n -= 4;
>> +      uint32_t t = *(uint32_t *)(a + n);
>> +      *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
>> +      *(uint32_t *)(b + n) = t;
>> +    } while (n);
>> +}

I'm not certain swap_words_32 / swap_words_64 will be optimal for larger
key sizes. Looking at GCC's implementation of swap_generic on modern
x86_64: https://godbolt.org/z/638h3Y9va
It's able to optimize the temporary buffer out of the loop and use xmm
registers, which will likely win out for larger sizes.

>> [...]
>>
>> +  swap_func_t swap_func;
>> +  if (is_aligned_to_copy (pbase, size, 8))
>> +    swap_func = SWAP_WORDS_64;
>> +  else if (is_aligned_to_copy (pbase, size, 4))

For many modern architectures that support fast unaligned loads/stores
(for example x86_64 SnB and newer) I don't think this check really makes
sense.

>> +    swap_func = SWAP_WORDS_32;
>> +  else
>> +    swap_func = SWAP_BYTES;
>> [...]
On 13/10/2021 00:29, Noah Goldstein wrote:
>> [...]
>> +/* Replace the indirect call with a series of if statements.  It should
>> +   help the branch predictor.  */
>
> 1) Really? On Intel at least an indirect call that is always going to the
> same place is certainly going to be predicted as well if not better than
> 2/3 branches + direct call.

I shamelessly copied the same strategy the Linux kernel used in its
lib/sort.c (8fb583c4258d08f0).  Maybe Linux's internal usage of its
qsort() leads to more predictable branches, and for this change I would
prefer something that works well on different architectures rather than
assuming a specific one.

> 2) If you're going to just test which swap function to use, why bother
> initializing swap_func? Why not just use an int?

Indeed, there is not much gain for the glibc usage.  The kernel provides
an API with a user-defined swap_func, which is not our case.
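For reference, the int-based dispatch Noah is suggesting could look like
the sketch below, with a hypothetical swap_type_t enum replacing the
casted function-pointer constants (this is not code from the patch, just
an illustration reusing the patch's swap helpers):

```c
/* Hypothetical alternative to the patch's (swap_func_t)0/1/2 constants:
   dispatch on a plain enum instead.  */
enum swap_type_t { SWAP_WORDS_64, SWAP_WORDS_32, SWAP_BYTES };

static void
do_swap (void * restrict a, void * restrict b, size_t size,
         enum swap_type_t swap_type)
{
  /* Same selection logic as the patch, but SWAP_* are enum values
     rather than function-pointer casts of 0, 1 and 2.  */
  if (swap_type == SWAP_WORDS_64)
    swap_words_64 (a, b, size);
  else if (swap_type == SWAP_WORDS_32)
    swap_words_32 (a, b, size);
  else
    swap_bytes (a, b, size);
}
```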
On 13/10/2021 00:39, Noah Goldstein wrote:
> On Tue, Oct 12, 2021 at 11:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>> On Fri, Sep 3, 2021 at 1:14 PM Adhemerval Zanella via Libc-alpha
>> <libc-alpha@sourceware.org> wrote:
>>> [...]
>
> I'm not certain swap_words_32 / swap_words_64 will be optimal for larger
> key sizes. Looking at GCC's implementation of swap_generic on modern
> x86_64: https://godbolt.org/z/638h3Y9va
> It's able to optimize the temporary buffer out of the loop and use xmm
> registers, which will likely win out for larger sizes.
It is probably not the most optimized code the compiler can generate,
since I went for more generic code that should give somewhat better
results in an architecture-agnostic way.  I tried to mimic some of the
optimizations Linux did in 37d0ec34d111acfdb, and for swap_bytes I used
the same strategy as d3496c9f4f27d (Linux did not optimize anything for
the byte version).

I think it would be possible to tune it for a specific architecture
and/or compiler, but I would prefer to use a good enough algorithm that
works reasonably on multiple architectures.
On Fri, Oct 15, 2021 at 8:12 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
> On 13/10/2021 00:29, Noah Goldstein wrote:
>> [...]
>> 1) Really? On Intel at least an indirect call that is always going to
>> the same place is certainly going to be predicted as well if not better
>> than 2/3 branches + direct call.
>
> I shamelessly copied the same strategy the Linux kernel used in its
> lib/sort.c (8fb583c4258d08f0).  Maybe Linux's internal usage of its
> qsort() leads to more predictable branches, and for this change I would
> prefer something that works well on different architectures rather than
> assuming a specific one.

If it's running in the kernel, Spectre mitigations can make indirect
jumps particularly expensive. I don't think we have the same issue
targeting userland.

>> 2) If you're going to just test which swap function to use, why bother
>> initializing swap_func? Why not just use an int?
>
> Indeed, there is not much gain for the glibc usage.  The kernel provides
> an API with a user-defined swap_func, which is not our case.
On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
> On 13/10/2021 00:39, Noah Goldstein wrote:
>> [...]
>> I'm not certain swap_words_32 / swap_words_64 will be optimal for larger
>> key sizes. Looking at GCC's implementation of swap_generic on modern
>> x86_64: https://godbolt.org/z/638h3Y9va
>> It's able to optimize the temporary buffer out of the loop and use xmm
>> registers, which will likely win out for larger sizes.
>
> It is probably not the most optimized code the compiler can generate,
> since I went for more generic code that should give somewhat better
> results in an architecture-agnostic way.  I tried to mimic some of the
> optimizations Linux did in 37d0ec34d111acfdb, and for swap_bytes I used
> the same strategy as d3496c9f4f27d (Linux did not optimize anything for
> the byte version).
>
> I think it would be possible to tune it for a specific architecture
> and/or compiler, but I would prefer to use a good enough algorithm that
> works reasonably on multiple architectures.

That's fair. Although I still think there are some improvements.

Looking at the assembly for all three, in fact it seems GCC optimizes all
of them to larger register copies: https://godbolt.org/z/bd9nnnoEY

The biggest difference seems to be the setup / register spills for the
generic version, so for the common case of a relatively small key the
special case for 4/8 makes sense.

Have you checked that GCC is able to use the conditions for selecting
the swap function to optimize the functions themselves? In the godbolt
link I got reasonable value out of adding the invariants to
swap_64/swap_32.

It also may be worth it to write a custom memcpy implementation for
size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably more
optimized than what a generic memcpy can get away with).
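For illustration, stating those invariants could look like the sketch
below, using GCC's __builtin_unreachable as the hint mechanism. This is
my reconstruction, not necessarily the exact formulation in the godbolt
link, and it assumes the patch's surrounding code:

```c
static void
swap_words_64 (void * restrict a, void * restrict b, size_t n)
{
  /* State the invariants guaranteed by the caller's
     is_aligned_to_copy (pbase, size, 8) check: N is a nonzero multiple
     of 8.  This lets GCC drop the scalar prologue/epilogue it would
     otherwise emit for the general case.  */
  if (n == 0 || n % 8 != 0)
    __builtin_unreachable ();
  do
    {
      n -= 8;
      uint64_t t = *(uint64_t *)(a + n);
      *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
      *(uint64_t *)(b + n) = t;
    } while (n);
}
```

The same hint with a multiple-of-4 invariant would apply to
swap_words_32.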
On 15/10/2021 13:45, Noah Goldstein wrote:
> On Fri, Oct 15, 2021 at 8:12 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>> [...]
>> I shamelessly copied the same strategy the Linux kernel used in its
>> lib/sort.c (8fb583c4258d08f0).  Maybe Linux's internal usage of its
>> qsort() leads to more predictable branches, and for this change I would
>> prefer something that works well on different architectures rather than
>> assuming a specific one.
>
> If it's running in the kernel, Spectre mitigations can make indirect
> jumps particularly expensive. I don't think we have the same issue
> targeting userland.

Indeed I didn't take this into consideration, and it does not seem to
help much in the glibc case.  I will use indirect calls.
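Concretely, the indirect-call variant would select a real function
pointer once in _quicksort and call it at each swap site, along these
lines (a sketch of the direction being discussed, not the committed
code):

```c
/* Sketch: with real function pointers the SWAP_* constants and the
   do_swap dispatcher go away; _quicksort picks the routine once and
   every swap site calls through the pointer.  */
typedef void (*swap_func_t) (void * restrict, void * restrict, size_t);

/* ... inside _quicksort, after the early return: */
swap_func_t swap_func;
if (is_aligned_to_copy (pbase, size, 8))
  swap_func = swap_words_64;
else if (is_aligned_to_copy (pbase, size, 4))
  swap_func = swap_words_32;
else
  swap_func = swap_bytes;

/* ... and at each former SWAP/do_swap call site: */
swap_func (left_ptr, right_ptr, size);
```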
On 15/10/2021 14:17, Noah Goldstein wrote:
> On Fri, Oct 15, 2021 at 8:29 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>> [...]
>> I think it would be possible to tune it for a specific architecture
>> and/or compiler, but I would prefer to use a good enough algorithm that
>> works reasonably on multiple architectures.
>
> That's fair. Although I still think there are some improvements.
>
> Looking at the assembly for all three, in fact it seems GCC optimizes
> all of them to larger register copies: https://godbolt.org/z/bd9nnnoEY
>
> The biggest difference seems to be the setup / register spills for the
> generic version, so for the common case of a relatively small key the
> special case for 4/8 makes sense.
>
> Have you checked that GCC is able to use the conditions for selecting
> the swap function to optimize the functions themselves? In the godbolt
> link I got reasonable value out of adding the invariants to
> swap_64/swap_32.

Not yet, but I think the generated code for both x86_64 and aarch64 seems
simple enough and should cover the common case (keys with size 4 or 8)
fast enough [1].

And since for v4 my plan is not to remove mergesort anymore, but to limit
it to a static buffer, it might be possible to use the same swap routines
for both mergesort and quicksort.

> It also may be worth it to write a custom memcpy implementation for
> size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably more
> optimized than what a generic memcpy can get away with).

I *really* do not want to go this way.  My hope is that with small sizes
the compiler can inline the memcpy (which gcc does for most common
architectures).

[1] https://godbolt.org/z/v7e4xxqGa
On Fri, Oct 15, 2021 at 12:34 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
> On 15/10/2021 14:17, Noah Goldstein wrote:
>> [...]
>> Have you checked that GCC is able to use the conditions for selecting
>> the swap function to optimize the functions themselves? In the godbolt
>> link I got reasonable value out of adding the invariants to
>> swap_64/swap_32.
>
> Not yet, but I think the generated code for both x86_64 and aarch64 seems
> simple enough and should cover the common case (keys with size 4 or 8)
> fast enough [1].

swap is in the inner loop. Seems like a pretty critical component to have
fully optimized. The aarch64 version looks good, but the x86_64 version
seems to be lacking. Not arguing for an arch-specific version, but if the
directives can add value to x86_64 without detracting from aarch64 it
seems like a zero-cost improvement.

> And since for v4 my plan is not to remove mergesort anymore, but to
> limit it to a static buffer, it might be possible to use the same swap
> routines for both mergesort and quicksort.
>
>> It also may be worth it to write a custom memcpy implementation for
>> size = 0..SWAP_GENERIC_SIZE so it can be inlined (and probably more
>> optimized than what a generic memcpy can get away with).
>
> I *really* do not want to go this way.  My hope is that with small sizes
> the compiler can inline the memcpy (which gcc does for most common
> architectures).

Since size is non-constant for the tail, I don't see how we are going to
avoid 3x memcpy calls. Although that can be another patch if it proves
valuable.

> [1] https://godbolt.org/z/v7e4xxqGa
On 15/10/2021 14:45, Noah Goldstein wrote:
> swap is in the inner loop. Seems like a pretty critical component to
> have fully optimized. The aarch64 version looks good, but the x86_64
> version seems to be lacking. Not arguing for an arch-specific version,
> but if the directives can add value to x86_64 without detracting from
> aarch64 it seems like a zero-cost improvement.
>
> Since size is non-constant for the tail, I don't see how we are going
> to avoid 3x memcpy calls. Although that can be another patch if it
> proves valuable.

Maybe use a byte copy in the tail to avoid memcpy [1]; another option
might be to tune SWAP_GENERIC_SIZE to make the tail less costly (16
should be ok for most architectures, although some might do better with
larger values).

[1] https://godbolt.org/z/G76dcej16
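A byte-copy tail along those lines might look like this variant of
swap_bytes (my reconstruction of the idea behind [1], not necessarily the
exact code there; it assumes the patch's surrounding qsort.c context):

```c
static void
swap_bytes (void * restrict a, void * restrict b, size_t n)
{
  /* The bulk of the swap still goes through constant-size memcpy so
     the compiler can inline it and use wide registers.  */
  enum { SWAP_GENERIC_SIZE = 32 };
  unsigned char tmp[SWAP_GENERIC_SIZE];
  while (n > SWAP_GENERIC_SIZE)
    {
      memcpy (tmp, a, SWAP_GENERIC_SIZE);
      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
      n -= SWAP_GENERIC_SIZE;
    }
  /* Byte-wise tail: N <= SWAP_GENERIC_SIZE here, so a simple loop
     replaces the three non-constant-size memcpy calls.  */
  while (n != 0)
    {
      --n;
      unsigned char t = ((unsigned char *) a)[n];
      ((unsigned char *) a)[n] = ((unsigned char *) b)[n];
      ((unsigned char *) b)[n] = t;
    }
}
```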
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
index 23f2d28314..59458d151b 100644
--- a/stdlib/qsort.c
+++ b/stdlib/qsort.c
@@ -24,20 +24,85 @@
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdbool.h>
 
-/* Byte-wise swap two items of size SIZE.  */
-#define SWAP(a, b, size)						      \
-  do									      \
-    {									      \
-      size_t __size = (size);						      \
-      char *__a = (a), *__b = (b);					      \
-      do								      \
-	{								      \
-	  char __tmp = *__a;						      \
-	  *__a++ = *__b;						      \
-	  *__b++ = __tmp;						      \
-	} while (--__size > 0);						      \
-    } while (0)
+/* Swap SIZE bytes between addresses A and B.  These helpers are provided
+   alongside the generic one as an optimization.  */
+
+typedef void (*swap_func_t)(void * restrict, void * restrict, size_t);
+
+/* Return true if elements can be copied using word loads and stores.
+   The size must be a multiple of the alignment, and the base address
+   as well.  */
+static inline bool
+is_aligned_to_copy (const void *base, size_t size, size_t align)
+{
+  unsigned char lsbits = size;
+#if !_STRING_ARCH_unaligned
+  lsbits |= (unsigned char)(uintptr_t) base;
+#endif
+  return (lsbits & (align - 1)) == 0;
+}
+
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES (swap_func_t)2
+
+static void
+swap_words_64 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+    {
+      n -= 8;
+      uint64_t t = *(uint64_t *)(a + n);
+      *(uint64_t *)(a + n) = *(uint64_t *)(b + n);
+      *(uint64_t *)(b + n) = t;
+    } while (n);
+}
+
+static void
+swap_words_32 (void * restrict a, void * restrict b, size_t n)
+{
+  do
+    {
+      n -= 4;
+      uint32_t t = *(uint32_t *)(a + n);
+      *(uint32_t *)(a + n) = *(uint32_t *)(b + n);
+      *(uint32_t *)(b + n) = t;
+    } while (n);
+}
+
+static void
+swap_bytes (void * restrict a, void * restrict b, size_t n)
+{
+  /* Use multiple small memcpys with constant size to enable inlining
+     on most targets.  */
+  enum { SWAP_GENERIC_SIZE = 32 };
+  unsigned char tmp[SWAP_GENERIC_SIZE];
+  while (n > SWAP_GENERIC_SIZE)
+    {
+      memcpy (tmp, a, SWAP_GENERIC_SIZE);
+      a = memcpy (a, b, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+      b = memcpy (b, tmp, SWAP_GENERIC_SIZE) + SWAP_GENERIC_SIZE;
+      n -= SWAP_GENERIC_SIZE;
+    }
+  memcpy (tmp, a, n);
+  memcpy (a, b, n);
+  memcpy (b, tmp, n);
+}
+
+/* Replace the indirect call with a series of if statements.  It should
+   help the branch predictor.  */
+static void
+do_swap (void * restrict a, void * restrict b, size_t size,
+	 swap_func_t swap_func)
+{
+  if (swap_func == SWAP_WORDS_64)
+    swap_words_64 (a, b, size);
+  else if (swap_func == SWAP_WORDS_32)
+    swap_words_32 (a, b, size);
+  else
+    swap_bytes (a, b, size);
+}
 
 /* Discontinue quicksort algorithm when partition gets below this size.
    This particular magic number was chosen to work best on a Sun 4/260.  */
@@ -97,6 +162,14 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
     /* Avoid lossage with unsigned arithmetic below.  */
     return;
 
+  swap_func_t swap_func;
+  if (is_aligned_to_copy (pbase, size, 8))
+    swap_func = SWAP_WORDS_64;
+  else if (is_aligned_to_copy (pbase, size, 4))
+    swap_func = SWAP_WORDS_32;
+  else
+    swap_func = SWAP_BYTES;
+
   if (total_elems > MAX_THRESH)
     {
       char *lo = base_ptr;
@@ -120,13 +193,13 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
       char *mid = lo + size * ((hi - lo) / size >> 1);
 
       if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	SWAP (mid, lo, size);
+	do_swap (mid, lo, size, swap_func);
       if ((*cmp) ((void *) hi, (void *) mid, arg) < 0)
-	SWAP (mid, hi, size);
+	do_swap (mid, hi, size, swap_func);
       else
	goto jump_over;
       if ((*cmp) ((void *) mid, (void *) lo, arg) < 0)
-	SWAP (mid, lo, size);
+	do_swap (mid, lo, size, swap_func);
       jump_over:;
 
       left_ptr  = lo + size;
@@ -145,7 +218,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
 
	  if (left_ptr < right_ptr)
	    {
-	      SWAP (left_ptr, right_ptr, size);
+	      do_swap (left_ptr, right_ptr, size, swap_func);
	      if (mid == left_ptr)
		mid = right_ptr;
	      else if (mid == right_ptr)
@@ -217,7 +290,7 @@ _quicksort (void *const pbase, size_t total_elems, size_t size,
	  tmp_ptr = run_ptr;
 
	  if (tmp_ptr != base_ptr)
-	    SWAP (tmp_ptr, base_ptr, size);
+	    do_swap (tmp_ptr, base_ptr, size, swap_func);
 
	  /* Insertion sort, running from left-hand-side up to
	     right-hand-side.  */
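As a quick illustration of the BZ#19305 fix described in the commit
message (an early return when there are fewer than two elements), a
minimal standalone check might look like this. It is a hypothetical test,
not part of the patch, and it assumes only what the commit message
states, namely that no comparison is performed for zero or one elements:

```c
#include <assert.h>
#include <stdlib.h>

static int ncalls;

static int
cmp_int (const void *a, const void *b)
{
  ++ncalls;
  return (*(const int *) a > *(const int *) b)
	 - (*(const int *) a < *(const int *) b);
}

int
main (void)
{
  int one[1] = { 42 };
  /* With the early return for 0 or 1 elements, the comparison function
     must never be called.  */
  qsort (one, 0, sizeof one[0], cmp_int);
  qsort (one, 1, sizeof one[0], cmp_int);
  assert (ncalls == 0);
  return 0;
}
```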