Message ID | xncze8e45v.fsf@greed.delorie.com
---|---
State | New
Series | [v1,1/1] memalign: Support scanning for aligned chunks.
On 7/13/22 23:58, DJ Delorie via Libc-alpha wrote:
> This patch adds a chunk scanning algorithm to the _int_memalign code
> path that reduces heap fragmentation by reusing already aligned chunks
> instead of always looking for chunks of larger sizes and splitting
> them.
>
> The goal is to fix the pathological use cases where heaps grow
> continuously in workloads that are heavy users of memalign.

Fails pre-commit CI.  Please review :-)

https://www.delorie.com/trybots/32bit/10936/
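As context for the fragmentation claim, here is a minimal, hypothetical
workload sketch (not from the patch or this thread) of the pattern the
commit message calls pathological: repeated memalign/free cycles with
small interleaved allocations that keep the freed aligned chunks from
coalescing.  Without aligned-chunk reuse, each cycle tends to split
fresh, larger chunks rather than recycle the freed aligned ones, so the
arena can keep growing.  The sizes and round count are arbitrary;
mallinfo2 requires glibc 2.33 or later.

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

static void *keep;   /* deliberately leaked, as in the patch's test */

int
main (void)
{
  enum { N = 512 };
  static void *blocks[N];

  for (int round = 0; round < 10; round++)
    {
      for (int i = 0; i < N; i++)
        {
          blocks[i] = memalign (64, 23450);
          /* Interleave small allocations so freed chunks can't merge.  */
          keep = malloc (512);
        }
      for (int i = 0; i < N; i++)
        free (blocks[i]);

      struct mallinfo2 mi = mallinfo2 ();
      printf ("round %d: arena %zu bytes\n", round, mi.arena);
    }
  return 0;
}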
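The first malloc.c hunk in the patch below adds a tcache fast path to
_mid_memalign: the head of the size-matched tcache bin is reused only
if it already happens to satisfy the requested alignment, since tcache
bins are keyed by size, not alignment.  The test itself is the standard
power-of-two mask check; a standalone sketch (simplified, not glibc's
internal code):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* ALIGNMENT must be a power of two, which _mid_memalign guarantees
   after rounding the caller's request up.  A pointer is aligned
   exactly when its low bits under (ALIGNMENT - 1) are all zero.  */
static bool
ptr_is_aligned (const void *p, size_t alignment)
{
  return ((uintptr_t) p & (alignment - 1)) == 0;
}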
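For the small-bin half of the scan in the patch below, chunks in a bin
all have the same size, so the first chunk that passes
chunk_ok_for_memalign is taken and spliced out of the doubly-linked,
circular bin list.  A standalone sketch with a hypothetical node type
in place of glibc's mchunkptr:

struct node
{
  struct node *fd;   /* next chunk in the bin */
  struct node *bk;   /* previous chunk in the bin */
};

/* BIN_HEAD is the bin's sentinel; OK stands in for
   chunk_ok_for_memalign (positive return means usable).  */
static struct node *
scan_and_unlink (struct node *bin_head, int (*ok) (struct node *))
{
  for (struct node *fwd = bin_head->fd; fwd != bin_head; fwd = fwd->fd)
    if (ok (fwd) > 0)
      {
        fwd->fd->bk = fwd->bk;   /* splice out of the circular list */
        fwd->bk->fd = fwd->fd;
        return fwd;
      }
  return NULL;
}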
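In the large-bin half, chunk sizes within a bin differ, so the patch
keeps scanning and remembers the candidate with the least waste;
because large bins are sorted by decreasing size, the walk can stop at
the first chunk smaller than the request.  A standalone sketch, again
with hypothetical stand-ins (WASTE plays the role of
chunk_ok_for_memalign):

#include <stddef.h>

struct lnode
{
  struct lnode *fd;   /* next chunk, in decreasing size order */
  size_t size;        /* chunk size */
};

static struct lnode *
least_waste_fit (struct lnode *sentinel, size_t nb,
                 int (*waste) (struct lnode *))
{
  struct lnode *best = NULL;
  size_t best_waste = 0;

  for (struct lnode *fwd = sentinel->fd; fwd != sentinel; fwd = fwd->fd)
    {
      if (fwd->size < nb)
        break;                   /* the rest of the bin is too small */
      int extra = waste (fwd);
      if (extra > 0 && (best == NULL || (size_t) extra <= best_waste))
        {
          best = fwd;
          best_waste = extra;
        }
    }
  return best;
}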
diff --git a/malloc/Makefile b/malloc/Makefile
index 4e32de2a0b..7a25f2d781 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
 	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
 	 tst-safe-linking \
 	 tst-mallocalign1 \
+	 tst-memalign-2
 
 tests-static := \
 	 tst-interpose-static-nothread \
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 12908b8f97..219707ebec 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3557,6 +3557,32 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
       alignment = a;
     }
 
+#if USE_TCACHE
+  {
+    size_t tbytes;
+    tbytes = checked_request2size (bytes);
+    if (tbytes == 0)
+      {
+	__set_errno (ENOMEM);
+	return NULL;
+      }
+    size_t tc_idx = csize2tidx (tbytes);
+
+    MAYBE_INIT_TCACHE ();
+
+    if (tc_idx < mp_.tcache_bins
+	&& tcache
+	&& tcache->counts[tc_idx] > 0
+	&& ((intptr_t)tcache->entries[tc_idx] & (alignment - 1)) == 0)
+      {
+	void *victim = tcache_get (tc_idx);
+	if (__glibc_unlikely (misaligned_chunk (victim)))
+	  malloc_printerr ("_mid_memalign(): unaligned tcache chunk detected");
+	return tag_new_usable (victim);
+      }
+  }
+#endif
+
   if (SINGLE_THREAD_P)
     {
       p = _int_memalign (&main_arena, alignment, bytes);
@@ -4937,6 +4963,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
    ------------------------------ memalign ------------------------------
  */
 
+/* Returns 0 if the chunk is not and does not contain the requested
+   aligned sub-chunk, else returns the amount of "waste" from
+   trimming.  BYTES is the *user* byte size, not the chunk byte
+   size.  */
+static int
+chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
+{
+  void *m = chunk2mem (p);
+  INTERNAL_SIZE_T size = memsize (p);
+  void *aligned_m = m;
+
+  if (__glibc_unlikely (misaligned_chunk (p)))
+    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+
+  aligned_m = PTR_ALIGN_UP (m, alignment);
+
+  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+
+  /* We can't trim off the front as it's too small.  */
+  if (front_extra > 0 && front_extra < MINSIZE)
+    return 0;
+
+  /* If it's a perfect fit, it's an exception to the return value rule
+     (we would return zero waste, which looks like "not usable"), so
+     handle it here by returning a small non-zero value instead.  */
+  if (size == bytes && front_extra == 0)
+    return 1;
+
+  /* If the block we need fits in the chunk, calculate total waste.  */
+  if (size > bytes + front_extra)
+    return size - bytes;
+
+  /* Can't use this chunk.  */
+  return 0;
+}
+
+/* BYTES is user requested bytes, not requested chunksize bytes.  */
 static void *
 _int_memalign (mstate av, size_t alignment, size_t bytes)
 {
@@ -4950,8 +5013,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
   mchunkptr remainder;            /* spare room at end to split off */
   unsigned long remainder_size;   /* its size */
   INTERNAL_SIZE_T size;
-
-
+  mchunkptr victim;
 
   nb = checked_request2size (bytes);
   if (nb == 0)
@@ -4960,29 +5022,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
       return NULL;
     }
 
-  /*
-     Strategy: find a spot within that chunk that meets the alignment
+  /* We can't check tcache here because we hold the arena lock, which
+     tcache doesn't expect.  We expect it has been checked
+     earlier.  */
+
+  /* Strategy: search the bins looking for an existing block that
+     meets our needs.  We scan a range of bins from "exact size" to
+     "just under 2x", spanning the small/large barrier if needed.  If
+     we don't find anything in those bins, the common malloc code will
+     scan starting at 2x.  */
+
+  /* This will be set if we found a candidate chunk.  */
+  victim = NULL;
+
+  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+     and unlikely to meet our alignment requirements.  We have not done
+     any experimentation with searching for aligned fastbins.  */
+
+  int first_bin_index;
+  int first_largebin_index;
+  int last_bin_index;
+
+  if (in_smallbin_range (nb))
+    first_bin_index = smallbin_index (nb);
+  else
+    first_bin_index = largebin_index (nb);
+
+  if (in_smallbin_range (nb * 2))
+    last_bin_index = smallbin_index (nb * 2);
+  else
+    last_bin_index = largebin_index (nb * 2);
+
+  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+
+  int victim_index;                 /* its bin index */
+
+  for (victim_index = first_bin_index;
+       victim_index < last_bin_index;
+       victim_index ++)
+    {
+      victim = NULL;
+
+      if (victim_index < first_largebin_index)
+	{
+	  /* Check small bins.  Small bin chunks are doubly-linked despite
+	     being the same size.  */
+
+	  mchunkptr fwd;                    /* misc temp for linking */
+	  mchunkptr bck;                    /* misc temp for linking */
+
+	  bck = bin_at (av, victim_index);
+	  fwd = bck->fd;
+	  while (fwd != bck)
+	    {
+	      if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
+		{
+		  victim = fwd;
+
+		  /* Unlink it */
+		  victim->fd->bk = victim->bk;
+		  victim->bk->fd = victim->fd;
+		  break;
+		}
+
+	      fwd = fwd->fd;
+	    }
+	}
+      else
+	{
+	  /* Check large bins.  */
+	  mchunkptr fwd;                    /* misc temp for linking */
+	  mchunkptr bck;                    /* misc temp for linking */
+	  mchunkptr best = NULL;
+	  size_t best_size = 0;
+
+	  bck = bin_at (av, victim_index);
+	  fwd = bck->fd;
+
+	  while (fwd != bck)
+	    {
+	      int extra;
+
+	      if (chunksize (fwd) < nb)
+		break;
+	      extra = chunk_ok_for_memalign (fwd, alignment, bytes);
+	      if (extra > 0
+		  && (extra <= best_size || best == NULL))
+		{
+		  best = fwd;
+		  best_size = extra;
+		}
+
+	      fwd = fwd->fd;
+	    }
+	  victim = best;
+
+	  if (victim != NULL)
+	    {
+	      unlink_chunk (av, victim);
+	      break;
+	    }
+	}
+
+      if (victim != NULL)
+	break;
+    }
+
+  /* Strategy: find a spot within that chunk that meets the alignment
      request, and then possibly free the leading and trailing space.
-   */
+     This strategy is incredibly costly and can lead to external
+     fragmentation if header and footer chunks are unused.  */
 
-  /* Call malloc with worst case padding to hit alignment. */
+  if (victim != NULL)
+    {
+      p = victim;
+      m = chunk2mem (p);
+      set_inuse (p);
+    }
+  else
+    {
+      /* Call malloc with worst case padding to hit alignment.  */
 
-  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
 
-  if (m == 0)
-    return 0;           /* propagate failure */
+      if (m == 0)
+	return 0;           /* propagate failure */
 
-  p = mem2chunk (m);
+      p = mem2chunk (m);
+    }
 
   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
-
-    { /*
-                Find an aligned spot inside chunk.  Since we need to give back
-                leading space in a chunk of at least MINSIZE, if the first
-                calculation places us at a spot with less than MINSIZE leader,
-                we can move to the next aligned spot -- we've allocated enough
-                total room so that this is always possible.
-              */
+    {
+      /* Find an aligned spot inside chunk.  Since we need to give back
+	 leading space in a chunk of at least MINSIZE, if the first
+	 calculation places us at a spot with less than MINSIZE leader,
+	 we can move to the next aligned spot -- we've allocated enough
+	 total room so that this is always possible.  */
       brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
                                 - ((signed long) alignment));
       if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
new file mode 100644
index 0000000000..04d42a2da2
--- /dev/null
+++ b/malloc/tst-memalign-2.c
@@ -0,0 +1,115 @@
+/* Test for memalign chunk reuse
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <array_length.h>
+
+#include <support/check.h>
+
+typedef struct TestCase {
+  size_t size;
+  size_t alignment;
+  void *ptr1;
+  void *ptr2;
+} TestCase;
+
+static TestCase tcache_allocs[] = {
+  { 24, 8, NULL, NULL },
+  { 24, 16, NULL, NULL },
+  { 128, 32, NULL, NULL }
+};
+#define TN array_length (tcache_allocs)
+
+static TestCase large_allocs[] = {
+  { 23450, 64, NULL, NULL },
+  { 23450, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23550, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 23650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL },
+  { 33650, 64, NULL, NULL }
+};
+#define LN array_length (large_allocs)
+
+void *p;
+
+static int
+do_test (void)
+{
+  int i, j;
+  int count;
+
+  /* TCache test.  */
+
+  for (i = 0; i < TN; ++ i)
+    {
+      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr1);
+      /* This should return the same chunk as was just free'd.  */
+      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
+      free (tcache_allocs[i].ptr2);
+
+      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+    }
+
+  /* Large bins test.  */
+
+  for (i = 0; i < LN; ++ i)
+    {
+      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+      /* Keep chunks from combining by fragmenting the heap.  */
+      p = malloc (512);
+    }
+
+  for (i = 0; i < LN; ++ i)
+    free (large_allocs[i].ptr1);
+
+  /* Force the unsorted bins to be scanned and moved to small/large
+     bins.  */
+  p = malloc (60000);
+
+  for (i = 0; i < LN; ++ i)
+    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
+
+  count = 0;
+  for (i = 0; i < LN; ++ i)
+    {
+      int ok = 0;
+      for (j = 0; j < LN; ++ j)
+	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
+	  ok = 1;
+      if (ok == 1)
+	count ++;
+    }
+
+  /* The allocation algorithm is complicated outside of the memalign
+     logic, so just make sure it's working for most of the
+     allocations.  This avoids possible boundary conditions with
+     empty/full heaps.  */
+  TEST_VERIFY (count > LN / 2);
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
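To make the return convention of chunk_ok_for_memalign above concrete:
0 means the chunk cannot hold the aligned request, 1 is a token value
for a perfect fit (so zero waste doesn't read as "unusable"), and
anything else is the bytes lost to trimming.  A standalone sketch with
hypothetical stand-ins (align_up for PTR_ALIGN_UP, MIN_SPLIT for the
platform-dependent MINSIZE, and a plain address/size pair in place of
glibc's chunk accessors):

#include <stddef.h>
#include <stdint.h>

#define MIN_SPLIT 32   /* stand-in for MINSIZE; real value varies */

static uintptr_t
align_up (uintptr_t addr, size_t alignment)
{
  return (addr + alignment - 1) & ~(uintptr_t) (alignment - 1);
}

/* USER_ADDR is the chunk's user-data address; SIZE its usable size;
   BYTES the user's request.  */
static size_t
waste_for_aligned_fit (uintptr_t user_addr, size_t size,
                       size_t alignment, size_t bytes)
{
  uintptr_t aligned = align_up (user_addr, alignment);
  size_t front_extra = aligned - user_addr;

  /* A nonzero leading gap must be large enough to hand back as a
     separate chunk, or the front cannot be trimmed.  */
  if (front_extra > 0 && front_extra < MIN_SPLIT)
    return 0;

  /* Perfect fit: report token waste 1 so 0 can mean "unusable".  */
  if (size == bytes && front_extra == 0)
    return 1;

  /* The request plus the leading gap must fit inside the chunk.  */
  if (size > bytes + front_extra)
    return size - bytes;

  return 0;
}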