Message ID | xna88tzpuu.fsf@greed.delorie.com |
---|---|
State | New |
Headers | show |
On Fri, Mar 10, 2017 at 5:02 AM, DJ Delorie <dj@redhat.com> wrote: > > Branch updated... > > Fix whitespace around operators. > > Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper > > Add helper functions for tcache_get() and tcache_put() to collect > common code in one spot. > > Updated patch attached. (git diff origin/master...origin/dj/malloc-tcache) This looks ok to me from a basic review - the code looks reasonable, no testsuite regressions or warnings added. It could do with a ChangeLog and NEWS entry though. It would be good to get wider testing of the performance on performance critical workloads that other people may have. > I also put a copy of the oocalc workload online, as a sample of the > workloads I've been using to benchmark. It's 16M compressed, and you > have to decompress it (to 65M) to use it with the simulator. > > http://www.delorie.com/tmp/oocalc.wl.bz2 > > (if you use wget, use "-U malloc" ;) > > diff --git a/config.make.in b/config.make.in > index 5836b32..0290d83 100644 > --- a/config.make.in > +++ b/config.make.in > @@ -77,6 +77,8 @@ multi-arch = @multi_arch@ > > mach-interface-list = @mach_interface_list@ > > +experimental-malloc = @experimental_malloc@ > + > nss-crypt = @libc_cv_nss_crypt@ > static-nss-crypt = @libc_cv_static_nss_crypt@ > > diff --git a/configure.ac b/configure.ac > index 4a77411..b929012 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch], > [multi_arch=$enableval], > [multi_arch=default]) > > +AC_ARG_ENABLE([experimental-malloc], > + AC_HELP_STRING([--disable-experimental-malloc], > + [disable experimental malloc features]), > + [experimental_malloc=$enableval], > + [experimental_malloc=yes]) > +AC_SUBST(experimental_malloc) > + > AC_ARG_ENABLE([nss-crypt], > AC_HELP_STRING([--enable-nss-crypt], > [enable libcrypt to use nss]), > diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list > index cb9e8f1..37620c8 100644 > --- a/elf/dl-tunables.list > +++ 
b/elf/dl-tunables.list > @@ -76,5 +76,17 @@ glibc { > minval: 1 > security_level: SXID_IGNORE > } > + tcache_max { > + type: SIZE_T > + env_alias: MALLOC_TCACHE_MAX > + } > + tcache_count { > + type: SIZE_T > + env_alias: MALLOC_TCACHE_COUNT > + } > + tcache_unsorted_limit { > + type: SIZE_T > + env_alias: MALLOC_TCACHE_UNSORTED_LIMIT > + } > } > } > diff --git a/malloc/Makefile b/malloc/Makefile > index e93b83b..dd8a43a 100644 > --- a/malloc/Makefile > +++ b/malloc/Makefile > @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV) > tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3 > tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV) > > +ifeq ($(experimental-malloc),yes) > +CPPFLAGS-malloc.c += -DUSE_TCACHE > +endif > # Uncomment this for test releases. For public releases it is too expensive. > #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1 > > diff --git a/malloc/arena.c b/malloc/arena.c > index d49e4a2..79e918f 100644 > --- a/malloc/arena.c > +++ b/malloc/arena.c > @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t) > DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t) > DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t) > DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t) > +#if USE_TCACHE > +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t) > +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t) > +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t) > +#endif > #else > /* Initialization routine. 
*/ > #include <string.h> > @@ -322,6 +327,11 @@ ptmalloc_init (void) > TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max); > TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max); > TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test); > +#if USE_TCACHE > + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max); > + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count); > + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit); > +#endif > __libc_lock_unlock (main_arena.mutex); > #else > const char *s = NULL; > @@ -371,7 +381,23 @@ ptmalloc_init (void) > if (memcmp (envline, "ARENA_TEST", 10) == 0) > __libc_mallopt (M_ARENA_TEST, atoi (&envline[11])); > } > +#if USE_TCACHE > + if (!__builtin_expect (__libc_enable_secure, 0)) > + { > + if (memcmp (envline, "TCACHE_MAX", 10) == 0) > + __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11])); > + } > +#endif > break; > +#if USE_TCACHE > + case 12: > + if (!__builtin_expect (__libc_enable_secure, 0)) > + { > + if (memcmp (envline, "TCACHE_COUNT", 12) == 0) > + __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13])); > + } > + break; > +#endif > case 15: > if (!__builtin_expect (__libc_enable_secure, 0)) > { > @@ -381,6 +407,15 @@ ptmalloc_init (void) > __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16])); > } > break; > +#if USE_TCACHE > + case 21: > + if (!__builtin_expect (__libc_enable_secure, 0)) > + { > + if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0) > + __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22])); > + } > + break; > +#endif > default: > break; > } > diff --git a/malloc/malloc.c b/malloc/malloc.c > index e29105c..8cd03d8 100644 > --- a/malloc/malloc.c > +++ b/malloc/malloc.c > @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line, > } > #endif > > +#ifndef USE_TCACHE > +# define USE_TCACHE 0 > +#endif > +#if USE_TCACHE > +/* We want 64 entries. 
This is an arbitrary limit, which tunables can reduce. */ > +# define MAX_TCACHE_SIZE (MALLOC_ALIGNMENT * 63) > +# define TCACHE_IDX ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1) > +# define size2tidx_(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT) > + > +# define tidx2csize(idx) ((idx) * MALLOC_ALIGNMENT + SIZE_SZ) > +# define tidx2usize(idx) ((idx) * MALLOC_ALIGNMENT) > + > +/* When "x" is a user-provided size. */ > +# define usize2tidx(x) size2tidx_ (x) > +/* When "x" is from chunksize(). */ > +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ) > + > +/* Rounds up, so... > + idx 0 bytes 0 > + idx 1 bytes 1..8 > + idx 2 bytes 9..16 > + etc. */ > + > +/* This is another arbitrary limit, which tunables can change. */ > +# define TCACHE_FILL_COUNT 7 > +#endif > + > > /* > REALLOC_ZERO_BYTES_FREES should be set if a call to > @@ -1711,6 +1738,17 @@ struct malloc_par > > /* First address handed out by MORECORE/sbrk. */ > char *sbrk_base; > + > +#if USE_TCACHE > + /* Maximum number of buckets to use. */ > + size_t tcache_max; > + size_t tcache_max_bytes; > + /* Maximum number of chunks in each bucket. */ > + size_t tcache_count; > + /* Maximum number of chunks to remove from the unsorted list, which > + don't match. */ > + size_t tcache_unsorted_limit; > +#endif > }; > > /* There are several instances of this struct ("arenas") in this > @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ = > .trim_threshold = DEFAULT_TRIM_THRESHOLD, > #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8)) > .arena_test = NARENAS_FROM_NCORES (1) > +#if USE_TCACHE > + , > + .tcache_count = TCACHE_FILL_COUNT, > + .tcache_max = TCACHE_IDX, > + .tcache_max_bytes = tidx2usize (TCACHE_IDX-1), > + .tcache_unsorted_limit = 0 /* No limit */ > +#endif > }; > > +/* Non public mallopt parameters. */ > +#if USE_TCACHE > +# define M_TCACHE_COUNT -9 > +# define M_TCACHE_MAX -10 > +# define M_TCACHE_UNSORTED_LIMIT -11 > +#endif > + > /* Maximum size of memory handled in fastbins. 
*/ > static INTERNAL_SIZE_T global_max_fast; > > @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size) > > /*------------------------ Public wrappers. --------------------------------*/ > > +#if USE_TCACHE > + > +typedef struct TCacheEntry { > + struct TCacheEntry *next; > +} TCacheEntry; > + > +/* There is one of these for each thread, which contains the > + per-thread cache (hence "TCache"). Keeping overall size low is > + mildly important. Note that COUNTS and ENTRIES are redundant, this > + is for performance reasons. */ > +typedef struct TCache { > + char counts[TCACHE_IDX]; > + TCacheEntry *entries[TCACHE_IDX]; > +} TCache; > + > +static __thread char tcache_shutting_down = 0; > +static __thread TCache *tcache = NULL; > + > +static void > +tcache_put (mchunkptr chunk, size_t tc_idx) > +{ > + TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk); > + e->next = tcache->entries[tc_idx]; > + tcache->entries[tc_idx] = e; > + ++(tcache->counts[tc_idx]); > +} > + > +static void * > +tcache_get (size_t tc_idx) > +{ > + TCacheEntry *e = tcache->entries[tc_idx]; > + tcache->entries[tc_idx] = e->next; > + --(tcache->counts[tc_idx]); > + return (void *) e; > +} > + > +static void __attribute__ ((section ("__libc_thread_freeres_fn"))) > +tcache_thread_freeres (void) > +{ > + int i; > + TCache *tcache_tmp = tcache; > + > + if (!tcache) > + return; > + > + tcache = NULL; > + > + for (i = 0; i < TCACHE_IDX; ++i) { > + while (tcache_tmp->entries[i]) > + { > + TCacheEntry *e = tcache_tmp->entries[i]; > + tcache_tmp->entries[i] = e->next; > + __libc_free (e); > + } > + } > + > + __libc_free (tcache_tmp); > + > + tcache_shutting_down = 1; > +} > +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres); > + > +static void > +tcache_init(void) > +{ > + mstate ar_ptr; > + void *victim = 0; > + const size_t bytes = sizeof (TCache); > + > + if (tcache_shutting_down) > + return; > + > + arena_get (ar_ptr, bytes); > + victim = _int_malloc (ar_ptr, bytes); > + if 
(!victim && ar_ptr != NULL) > + { > + ar_ptr = arena_get_retry (ar_ptr, bytes); > + victim = _int_malloc (ar_ptr, bytes); > + } > + > + > + if (ar_ptr != NULL) > + __libc_lock_unlock (ar_ptr->mutex); > + > + if (victim) > + { > + tcache = (TCache *) victim; > + memset (tcache, 0, sizeof (TCache)); > + } > + > +} > + > +#define MAYBE_INIT_TCACHE() \ > + if (__glibc_unlikely (tcache == NULL)) \ > + tcache_init(); > + > +#else > +#define MAYBE_INIT_TCACHE() > +#endif > + > void * > __libc_malloc (size_t bytes) > { > @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes) > = atomic_forced_read (__malloc_hook); > if (__builtin_expect (hook != NULL, 0)) > return (*hook)(bytes, RETURN_ADDRESS (0)); > +#if USE_TCACHE > + /* int_free also calls request2size, be careful to not pad twice. */ > + size_t tbytes = request2size (bytes); > + size_t tc_idx = csize2tidx (tbytes); > + > + MAYBE_INIT_TCACHE (); > + > + if (tc_idx < mp_.tcache_max > + && tc_idx < TCACHE_IDX /* to appease gcc */ > + && tcache > + && tcache->entries[tc_idx] != NULL) > + { > + return tcache_get (tc_idx); > + } > +#endif > > arena_get (ar_ptr, bytes); > > @@ -2943,6 +3110,8 @@ __libc_free (void *mem) > return; > } > > + MAYBE_INIT_TCACHE (); > + > ar_ptr = arena_for_chunk (p); > _int_free (ar_ptr, p, 0); > } > @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes) > if (chunk_is_mmapped (oldp)) > ar_ptr = NULL; > else > - ar_ptr = arena_for_chunk (oldp); > + { > + MAYBE_INIT_TCACHE (); > + ar_ptr = arena_for_chunk (oldp); > + } > > /* Little security check which won't hurt performance: the allocator > never wrapps around at the end of the address space. 
Therefore > @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size) > > sz = bytes; > > + MAYBE_INIT_TCACHE (); > + > arena_get (av, sz); > if (av) > { > @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes) > mchunkptr fwd; /* misc temp for linking */ > mchunkptr bck; /* misc temp for linking */ > > +#if USE_TCACHE > + size_t tcache_unsorted_count; /* count of unsorted chunks processed */ > +#endif > + > const char *errstr = NULL; > > /* > @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes) > return NULL; > } > check_remalloced_chunk (av, victim, nb); > +#if USE_TCACHE > + /* While we're here, if we see other chunks of the same size, > + stash them in the tcache. */ > + size_t tc_idx = csize2tidx (nb); > + if (tcache && tc_idx < mp_.tcache_max) > + { > + mchunkptr tc_victim; > + int found = 0; > + > + /* While bin not empty and tcache not full, copy chunks over. */ > + while (tcache->counts[tc_idx] < mp_.tcache_count > + && (pp = *fb) != NULL) > + { > + do > + { > + tc_victim = pp; > + if (tc_victim == NULL) > + break; > + } > + while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim)) > + != tc_victim); > + if (tc_victim != 0) > + { > + tcache_put (tc_victim, tc_idx); > + ++found; > + } > + } > + } > +#endif > void *p = chunk2mem (victim); > alloc_perturb (p, bytes); > return p; > @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes) > if (av != &main_arena) > set_non_main_arena (victim); > check_malloced_chunk (av, victim, nb); > +#if USE_TCACHE > + /* While we're here, if we see other chunks of the same size, > + stash them in the tcache. */ > + size_t tc_idx = csize2tidx (nb); > + if (tcache && tc_idx < mp_.tcache_max) > + { > + mchunkptr tc_victim; > + int found = 0; > + > + /* While bin not empty and tcache not full, copy chunks over. 
*/ > + while (tcache->counts[tc_idx] < mp_.tcache_count > + && (tc_victim = last (bin)) != bin) > + { > + if (tc_victim != 0) > + { > + bck = tc_victim->bk; > + set_inuse_bit_at_offset (tc_victim, nb); > + if (av != &main_arena) > + set_non_main_arena (tc_victim); > + bin->bk = bck; > + bck->fd = bin; > + > + tcache_put (tc_victim, tc_idx); > + ++found; > + } > + } > + } > +#endif > void *p = chunk2mem (victim); > alloc_perturb (p, bytes); > return p; > @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes) > otherwise need to expand memory to service a "small" request. > */ > > +#if USE_TCACHE > + INTERNAL_SIZE_T tcache_nb = 0; > + size_t tc_idx = csize2tidx (nb); > + if (tcache && tc_idx < mp_.tcache_max) > + tcache_nb = nb; > + int return_cached = 0; > + > + tcache_unsorted_count = 0; > +#endif > + > for (;; ) > { > int iters = 0; > @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes) > set_inuse_bit_at_offset (victim, size); > if (av != &main_arena) > set_non_main_arena (victim); > +#if USE_TCACHE > + /* Fill cache first, return to user only if cache fills. > + We may return one of these chunks later. */ > + if (tcache_nb > + && tcache->counts[tc_idx] < mp_.tcache_count) > + { > + tcache_put (victim, tc_idx); > + return_cached = 1; > + continue; > + } > + else > + { > +#endif > check_malloced_chunk (av, victim, nb); > void *p = chunk2mem (victim); > alloc_perturb (p, bytes); > return p; > +#if USE_TCACHE > + } > +#endif > } > > /* place chunk in bin */ > @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes) > fwd->bk = victim; > bck->fd = victim; > > +#if USE_TCACHE > + /* If we've processed as many chunks as we're allowed while > + filling the cache, return one of the cached ones. 
*/ > + ++tcache_unsorted_count; > + if (return_cached > + && mp_.tcache_unsorted_limit > 0 > + && tcache_unsorted_count > mp_.tcache_unsorted_limit) > + { > + return tcache_get (tc_idx); > + } > +#endif > + > #define MAX_ITERS 10000 > if (++iters >= MAX_ITERS) > break; > } > > +#if USE_TCACHE > + /* If all the small chunks we found ended up cached, return one now. */ > + if (return_cached) > + { > + return tcache_get (tc_idx); > + } > +#endif > + > /* > If a large request, scan through the chunks of current bin in > sorted order to find smallest that fits. Use the skip list for this. > @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock) > > check_inuse_chunk(av, p); > > +#if USE_TCACHE > + { > + size_t tc_idx = csize2tidx (size); > + > + if (tcache > + && tc_idx < mp_.tcache_max > + && tcache->counts[tc_idx] < mp_.tcache_count) > + { > + tcache_put (p, tc_idx); > + return; > + } > + } > +#endif > + > /* > If eligible, place chunk on a fastbin so it can be found > and used quickly in malloc. 
> @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value) > return 1; > } > > +#if USE_TCACHE > +static inline int > +__always_inline > +do_set_tcache_max (size_t value) > +{ > + LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes); > + if (value >= 0 && value <= MAX_TCACHE_SIZE) > + { > + mp_.tcache_max_bytes = value; > + mp_.tcache_max = usize2tidx (value) + 1; > + } > + return 1; > +} > + > +static inline int > +__always_inline > +do_set_tcache_count (size_t value) > +{ > + LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count); > + mp_.tcache_count = value; > + return 1; > +} > + > +static inline int > +__always_inline > +do_set_tcache_unsorted_limit (size_t value) > +{ > + LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit); > + mp_.tcache_unsorted_limit = value; > + return 1; > +} > +#endif > > int > __libc_mallopt (int param_number, int value) > @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value) > if (value > 0) > do_set_arena_test (value); > break; > +#if USE_TCACHE > + case M_TCACHE_COUNT: > + if (value >= 0) > + do_set_tcache_count (value); > + break; > + case M_TCACHE_MAX: > + if (value >= 0) > + do_set_tcache_max (value); > + break; > + case M_TCACHE_UNSORTED_LIMIT: > + if (value >= 0) > + do_set_tcache_unsorted_limit (value); > + break; > +#endif > } > __libc_lock_unlock (av->mutex); > return res;
On 03/10/2017 12:02 AM, DJ Delorie wrote: > Branch updated... > > Fix whitespace around operators. > > Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper > > Add helper functions for tcache_get() and tcache_put() to collect > common code in one spot. > > Updated patch attached. (git diff origin/master...origin/dj/malloc-tcache) I'm very excited to see this go upstream. Overall this is looking better and almost there. I think one more version given these comments and I think we can consider this ready. Please review, integrate, and produce a v3 and repost it. Please TO: me so I can review again. > I also put a copy of the oocalc workload online, as a sample of the > workloads I've been using to benchmark. It's 16M compressed, and you > have to decompress it (to 65M) to use it with the simulator. > > http://www.delorie.com/tmp/oocalc.wl.bz2 > > (if you use wget, use "-U malloc" ;) High-level review: - Adding a thread local cache of chunks for better performance. OK. The reasons for tcache make sense, and the current workload benchmarks make sense to me. On the average having a pool of chunks per-thread to fetch quickly makes for better average performance across the workloads you have looked at. If people claim their workloads are impacted then the trace infrastructure can be used to capture and reproduce that workload to see what is going on. Design Review: - Adding legacy environment variables. Not OK. We have tunables framework now and we don't need legacy environment variables. Please remove the legacy environment variables and resubmit. The same applies for the private mallopt interface you were using. - Missing manual documentation for new tcache tunables. Not OK. Please add the tcache tunables to the manual in the tunables section and describe in detail what they do. I don't know if we can easily have a conditional section in the manual? I don't really care though, just describe the entries and they'll get ignored if tcache isn't enabled anyway. 
- Missing manual documentation for new probes. Not OK. I applaud the addition of new useful probes in the malloc subsystem. Please document the new probes and their interface in probes.texi. - Tunables security level? The malloc tunables should all be 'security_level: SXID_IGNORE'. - Code duplication? In _int_malloc we have some code duplication where tcache and the normal code both do the same thing e.g. remove a block. Can we avoid some of this duplication with a first refactoring patch? Low-level review: Follows... > diff --git a/config.make.in b/config.make.in > index 5836b32..0290d83 100644 > --- a/config.make.in > +++ b/config.make.in > @@ -77,6 +77,8 @@ multi-arch = @multi_arch@ > > mach-interface-list = @mach_interface_list@ > > +experimental-malloc = @experimental_malloc@ > + OK. > nss-crypt = @libc_cv_nss_crypt@ > static-nss-crypt = @libc_cv_static_nss_crypt@ > > diff --git a/configure.ac b/configure.ac > index 4a77411..b929012 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch], > [multi_arch=$enableval], > [multi_arch=default]) > > +AC_ARG_ENABLE([experimental-malloc], > + AC_HELP_STRING([--disable-experimental-malloc], > + [disable experimental malloc features]), > + [experimental_malloc=$enableval], > + [experimental_malloc=yes]) > +AC_SUBST(experimental_malloc) OK. Enabled by default is good IMO. > + > AC_ARG_ENABLE([nss-crypt], > AC_HELP_STRING([--enable-nss-crypt], > [enable libcrypt to use nss]), > diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list > index cb9e8f1..37620c8 100644 > --- a/elf/dl-tunables.list > +++ b/elf/dl-tunables.list > @@ -76,5 +76,17 @@ glibc { > minval: 1 > security_level: SXID_IGNORE > } > + tcache_max { > + type: SIZE_T > + env_alias: MALLOC_TCACHE_MAX > + } > + tcache_count { > + type: SIZE_T > + env_alias: MALLOC_TCACHE_COUNT > + } > + tcache_unsorted_limit { > + type: SIZE_T > + env_alias: MALLOC_TCACHE_UNSORTED_LIMIT Noted already that legacy env must be removed. 
Noted already that manual needs updating to describe these. Noted already these should be 'security_level: SXID_IGNORE' to avoid any impact on secure processes. Otherwise OK. > + } > } > } > diff --git a/malloc/Makefile b/malloc/Makefile > index e93b83b..dd8a43a 100644 > --- a/malloc/Makefile > +++ b/malloc/Makefile > @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV) > tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3 > tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV) > > +ifeq ($(experimental-malloc),yes) > +CPPFLAGS-malloc.c += -DUSE_TCACHE > +endif Must be -DUSE_TCACHE=1, please follow -Wundef rules: https://sourceware.org/glibc/wiki/Wundef Needs a else conditional to set -DUSE_TCACHE=0. > # Uncomment this for test releases. For public releases it is too expensive. > #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1 > > diff --git a/malloc/arena.c b/malloc/arena.c > index d49e4a2..79e918f 100644 > --- a/malloc/arena.c > +++ b/malloc/arena.c > @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t) > DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t) > DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t) > DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t) > +#if USE_TCACHE > +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t) > +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t) > +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t) > +#endif OK. > #else > /* Initialization routine. 
*/ > #include <string.h> > @@ -322,6 +327,11 @@ ptmalloc_init (void) > TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max); > TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max); > TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test); > +#if USE_TCACHE > + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max); > + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count); > + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit); Overlong line. Please wrap. e.g. TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit); > +#endif OK. > __libc_lock_unlock (main_arena.mutex); > #else > const char *s = NULL; > @@ -371,7 +381,23 @@ ptmalloc_init (void) > if (memcmp (envline, "ARENA_TEST", 10) == 0) > __libc_mallopt (M_ARENA_TEST, atoi (&envline[11])); > } > +#if USE_TCACHE > + if (!__builtin_expect (__libc_enable_secure, 0)) > + { > + if (memcmp (envline, "TCACHE_MAX", 10) == 0) > + __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11])); > + } > +#endif Remove. > break; > +#if USE_TCACHE > + case 12: > + if (!__builtin_expect (__libc_enable_secure, 0)) > + { > + if (memcmp (envline, "TCACHE_COUNT", 12) == 0) > + __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13])); > + } > + break; > +#endif Remove. > case 15: > if (!__builtin_expect (__libc_enable_secure, 0)) > { > @@ -381,6 +407,15 @@ ptmalloc_init (void) > __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16])); > } > break; > +#if USE_TCACHE > + case 21: > + if (!__builtin_expect (__libc_enable_secure, 0)) > + { > + if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0) > + __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22])); > + } > + break; > +#endif Remove. 
> default: > break; > } > diff --git a/malloc/malloc.c b/malloc/malloc.c > index e29105c..8cd03d8 100644 > --- a/malloc/malloc.c > +++ b/malloc/malloc.c > @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line, > } > #endif > > +#ifndef USE_TCACHE > +# define USE_TCACHE 0 > +#endif Remove, not needed because you set it 0 or 1 in the makefile. Avoids macro-api pitfalls of accidentally undefined USE_TCACHE. > +#if USE_TCACHE > +/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */ > +# define MAX_TCACHE_SIZE (MALLOC_ALIGNMENT * 63) Why use 63? Is it because 0 is reserved for the empty zero byte set? If so then just say "We want 63 non-zero sized entries."? > +# define TCACHE_IDX ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1) Since this is the maximum index can we call this TCACHE_MAX_IDX? > +# define size2tidx_(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT) Why the trailing underscore? > + > +# define tidx2csize(idx) ((idx) * MALLOC_ALIGNMENT + SIZE_SZ) Unused, but OK from a symmetry perspective for debug uses. > +# define tidx2usize(idx) ((idx) * MALLOC_ALIGNMENT) > + > +/* When "x" is a user-provided size. */ > +# define usize2tidx(x) size2tidx_ (x) > +/* When "x" is from chunksize(). */ > +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ) > + > +/* Rounds up, so... > + idx 0 bytes 0 > + idx 1 bytes 1..8 > + idx 2 bytes 9..16 > + etc. */ This doesn't look correct for a 64-bit x86_64 system (maybe 32-bit?). Could you please adjust the comment for 64-bit x86_64 where MALLOC_ALIGNMENT should be 16 bytes, and indicate the comment is for that given machine. > + > +/* This is another arbitrary limit, which tunables can change. */ > +# define TCACHE_FILL_COUNT 7 Comment should describe in more detail what a developer should need to know about this. 
> +#endif > + > > /* > REALLOC_ZERO_BYTES_FREES should be set if a call to > @@ -1711,6 +1738,17 @@ struct malloc_par > > /* First address handed out by MORECORE/sbrk. */ > char *sbrk_base; > + > +#if USE_TCACHE > + /* Maximum number of buckets to use. */ > + size_t tcache_max; Can we call this tcache_max_idx to match the macro constant? > + size_t tcache_max_bytes; > + /* Maximum number of chunks in each bucket. */ > + size_t tcache_count; > + /* Maximum number of chunks to remove from the unsorted list, which > + don't match. */ Which don't match what? > + size_t tcache_unsorted_limit; > +#endif > }; > > /* There are several instances of this struct ("arenas") in this > @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ = > .trim_threshold = DEFAULT_TRIM_THRESHOLD, > #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8)) > .arena_test = NARENAS_FROM_NCORES (1) > +#if USE_TCACHE > + , > + .tcache_count = TCACHE_FILL_COUNT, > + .tcache_max = TCACHE_IDX, s/.tcache_max = TCACHE_IDX/.tcache_max_idx = TCACHE_MAX_IDX/g > + .tcache_max_bytes = tidx2usize (TCACHE_IDX-1), s/TCACHE_IDX-1/TCACHE - 1/g > + .tcache_unsorted_limit = 0 /* No limit */ s,/* No limit */,/* No limit. */ > +#endif > }; > > +/* Non public mallopt parameters. */ > +#if USE_TCACHE > +# define M_TCACHE_COUNT -9 > +# define M_TCACHE_MAX -10 > +# define M_TCACHE_UNSORTED_LIMIT -11 Remove. Not needed anymore. > +#endif > + > /* Maximum size of memory handled in fastbins. */ > static INTERNAL_SIZE_T global_max_fast; > > @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size) > > /*------------------------ Public wrappers. --------------------------------*/ > > +#if USE_TCACHE > + > +typedef struct TCacheEntry { > + struct TCacheEntry *next; > +} TCacheEntry; Needs explaining in detail that these apparently incomplete types are layered on top of a chunk. Otherwise OK. > + > +/* There is one of these for each thread, which contains the > + per-thread cache (hence "TCache"). 
Keeping overall size low is > + mildly important. Note that COUNTS and ENTRIES are redundant, this > + is for performance reasons. */ Why do you say they are redundant? Because you could in theory have a thread local pointer to TCacheEntry and walk that list when a malloc request arrives? If so, please expand the comment to cover that. > +typedef struct TCache { > + char counts[TCACHE_IDX]; > + TCacheEntry *entries[TCACHE_IDX]; > +} TCache; OK. > + > +static __thread char tcache_shutting_down = 0; OK. > +static __thread TCache *tcache = NULL; > + > +static void > +tcache_put (mchunkptr chunk, size_t tc_idx) > +{ > + TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk); Can we assert the safety of tc_idx? assert (tc_idx < TCACHE_IDX); ? > + e->next = tcache->entries[tc_idx]; > + tcache->entries[tc_idx] = e; > + ++(tcache->counts[tc_idx]); What prevents count overflow? > +} > + > +static void * > +tcache_get (size_t tc_idx) > +{ Can we assert the safety of tc_idx? > + TCacheEntry *e = tcache->entries[tc_idx]; > + tcache->entries[tc_idx] = e->next; > + --(tcache->counts[tc_idx]); What prevents count underflow? > + return (void *) e; > +} > + > +static void __attribute__ ((section ("__libc_thread_freeres_fn"))) > +tcache_thread_freeres (void) > +{ > + int i; > + TCache *tcache_tmp = tcache; > + > + if (!tcache) > + return; > + > + tcache = NULL; > + > + for (i = 0; i < TCACHE_IDX; ++i) { > + while (tcache_tmp->entries[i]) > + { > + TCacheEntry *e = tcache_tmp->entries[i]; > + tcache_tmp->entries[i] = e->next; > + __libc_free (e); > + } > + } > + > + __libc_free (tcache_tmp); > + > + tcache_shutting_down = 1; > +} OK. 
> +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres); > + > +static void > +tcache_init(void) > +{ > + mstate ar_ptr; > + void *victim = 0; > + const size_t bytes = sizeof (TCache); > + > + if (tcache_shutting_down) > + return; > + > + arena_get (ar_ptr, bytes); > + victim = _int_malloc (ar_ptr, bytes); > + if (!victim && ar_ptr != NULL) > + { > + ar_ptr = arena_get_retry (ar_ptr, bytes); > + victim = _int_malloc (ar_ptr, bytes); > + } > + > + > + if (ar_ptr != NULL) > + __libc_lock_unlock (ar_ptr->mutex); OK, unlock because arena_get_retry returns a locked arena. > + > + if (victim) > + { > + tcache = (TCache *) victim; > + memset (tcache, 0, sizeof (TCache)); > + } > + Please add a comment here that in a low-memory condition we will have victim == NULL and tcache == NULL and keep calling tcache_init at every opportunity to eventually get enough memory for the tcache. > +} > + > +#define MAYBE_INIT_TCACHE() \ > + if (__glibc_unlikely (tcache == NULL)) \ > + tcache_init(); > + > +#else > +#define MAYBE_INIT_TCACHE() > +#endif > + OK. > void * > __libc_malloc (size_t bytes) > { > @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes) > = atomic_forced_read (__malloc_hook); > if (__builtin_expect (hook != NULL, 0)) > return (*hook)(bytes, RETURN_ADDRESS (0)); > +#if USE_TCACHE > + /* int_free also calls request2size, be careful to not pad twice. */ > + size_t tbytes = request2size (bytes); > + size_t tc_idx = csize2tidx (tbytes); > + > + MAYBE_INIT_TCACHE (); > + > + if (tc_idx < mp_.tcache_max > + && tc_idx < TCACHE_IDX /* to appease gcc */ > + && tcache > + && tcache->entries[tc_idx] != NULL) > + { > + return tcache_get (tc_idx); > + } > +#endif OK. > > arena_get (ar_ptr, bytes); > > @@ -2943,6 +3110,8 @@ __libc_free (void *mem) > return; > } > > + MAYBE_INIT_TCACHE (); OK. 
> + > ar_ptr = arena_for_chunk (p); > _int_free (ar_ptr, p, 0); > } > @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes) > if (chunk_is_mmapped (oldp)) > ar_ptr = NULL; > else > - ar_ptr = arena_for_chunk (oldp); > + { > + MAYBE_INIT_TCACHE (); > + ar_ptr = arena_for_chunk (oldp); OK. > + } > > /* Little security check which won't hurt performance: the allocator > never wrapps around at the end of the address space. Therefore > @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size) > > sz = bytes; > > + MAYBE_INIT_TCACHE (); > + OK. > arena_get (av, sz); > if (av) > { > @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes) > mchunkptr fwd; /* misc temp for linking */ > mchunkptr bck; /* misc temp for linking */ > > +#if USE_TCACHE > + size_t tcache_unsorted_count; /* count of unsorted chunks processed */ > +#endif OK. > + > const char *errstr = NULL; > > /* > @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes) > return NULL; > } > check_remalloced_chunk (av, victim, nb); > +#if USE_TCACHE > + /* While we're here, if we see other chunks of the same size, > + stash them in the tcache. */ > + size_t tc_idx = csize2tidx (nb); > + if (tcache && tc_idx < mp_.tcache_max) > + { > + mchunkptr tc_victim; > + int found = 0; Unused 'found' > + > + /* While bin not empty and tcache not full, copy chunks over. */ > + while (tcache->counts[tc_idx] < mp_.tcache_count > + && (pp = *fb) != NULL) OK, start at the fastbin associated with the allocation request size. > + { > + do > + { > + tc_victim = pp; > + if (tc_victim == NULL) > + break; > + } > + while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim)) > + != tc_victim); OK, try to remove the fastbin ... > + if (tc_victim != 0) > + { > + tcache_put (tc_victim, tc_idx); ... and if you succeed then put it in the tcache. > + ++found; Remove (unused). > + } > + } > + } > +#endif This block of code is very very similar to the fastbin block just above it. 
Both blocks of code try to take a chunk out of a similar list. Is there any way we can consolidate this? Perhaps post a first patch which factorizes away the "get a chunk from the fastbin list" and then tcache would be smaller here. > void *p = chunk2mem (victim); > alloc_perturb (p, bytes); > return p; > @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes) > if (av != &main_arena) > set_non_main_arena (victim); > check_malloced_chunk (av, victim, nb); > +#if USE_TCACHE > + /* While we're here, if we see other chunks of the same size, > + stash them in the tcache. */ Same interaction as above but for non-fast bins. > + size_t tc_idx = csize2tidx (nb); > + if (tcache && tc_idx < mp_.tcache_max) > + { > + mchunkptr tc_victim; > + int found = 0; Unused. > + > + /* While bin not empty and tcache not full, copy chunks over. */ > + while (tcache->counts[tc_idx] < mp_.tcache_count > + && (tc_victim = last (bin)) != bin) > + { > + if (tc_victim != 0) > + { > + bck = tc_victim->bk; > + set_inuse_bit_at_offset (tc_victim, nb); > + if (av != &main_arena) > + set_non_main_arena (tc_victim); > + bin->bk = bck; > + bck->fd = bin; OK. > + > + tcache_put (tc_victim, tc_idx); > + ++found; Remove (unused). > + } > + } > + } > +#endif > void *p = chunk2mem (victim); > alloc_perturb (p, bytes); > return p; > @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes) > otherwise need to expand memory to service a "small" request. > */ > > +#if USE_TCACHE > + INTERNAL_SIZE_T tcache_nb = 0; > + size_t tc_idx = csize2tidx (nb); > + if (tcache && tc_idx < mp_.tcache_max) > + tcache_nb = nb; > + int return_cached = 0; > + > + tcache_unsorted_count = 0; We selectively apply this limit to just unsorted, but why not all the other places where we iterate filling the cache? Because in the other cases we have perfect size matches we know we want? > +#endif OK. 
> + > for (;; ) > { > int iters = 0; > @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes) > set_inuse_bit_at_offset (victim, size); > if (av != &main_arena) > set_non_main_arena (victim); > +#if USE_TCACHE > + /* Fill cache first, return to user only if cache fills. > + We may return one of these chunks later. */ > + if (tcache_nb > + && tcache->counts[tc_idx] < mp_.tcache_count) > + { > + tcache_put (victim, tc_idx); > + return_cached = 1; > + continue; > + } > + else > + { > +#endif OK. > check_malloced_chunk (av, victim, nb); > void *p = chunk2mem (victim); > alloc_perturb (p, bytes); > return p; > +#if USE_TCACHE > + } > +#endif > } > > /* place chunk in bin */ > @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes) > fwd->bk = victim; > bck->fd = victim; > > +#if USE_TCACHE > + /* If we've processed as many chunks as we're allowed while > + filling the cache, return one of the cached ones. */ > + ++tcache_unsorted_count; > + if (return_cached > + && mp_.tcache_unsorted_limit > 0 > + && tcache_unsorted_count > mp_.tcache_unsorted_limit) > + { > + return tcache_get (tc_idx); OK. > + } > +#endif > + > #define MAX_ITERS 10000 > if (++iters >= MAX_ITERS) > break; > } > > +#if USE_TCACHE > + /* If all the small chunks we found ended up cached, return one now. */ > + if (return_cached) > + { > + return tcache_get (tc_idx); OK. > + } > +#endif > + > /* > If a large request, scan through the chunks of current bin in > sorted order to find smallest that fits. Use the skip list for this. > @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock) > > check_inuse_chunk(av, p); > > +#if USE_TCACHE > + { > + size_t tc_idx = csize2tidx (size); > + > + if (tcache > + && tc_idx < mp_.tcache_max > + && tcache->counts[tc_idx] < mp_.tcache_count) > + { > + tcache_put (p, tc_idx); > + return; OK. > + } > + } > +#endif > + > /* > If eligible, place chunk on a fastbin so it can be found > and used quickly in malloc. 
> @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value) > return 1; > } > > +#if USE_TCACHE > +static inline int > +__always_inline > +do_set_tcache_max (size_t value) > +{ > + LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes); s/memory_mallopt_tcache_max_bytes/memory_set_tcache_max_bytes/g > + if (value >= 0 && value <= MAX_TCACHE_SIZE) > + { > + mp_.tcache_max_bytes = value; > + mp_.tcache_max = usize2tidx (value) + 1; > + } > + return 1; > +} > + > +static inline int > +__always_inline > +do_set_tcache_count (size_t value) > +{ > + LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count); Likewise. > + mp_.tcache_count = value; > + return 1; > +} > + > +static inline int > +__always_inline > +do_set_tcache_unsorted_limit (size_t value) > +{ > + LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit); Likewise. > + mp_.tcache_unsorted_limit = value; > + return 1; > +} > +#endif > > int > __libc_mallopt (int param_number, int value) > @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value) > if (value > 0) > do_set_arena_test (value); > break; > +#if USE_TCACHE > + case M_TCACHE_COUNT: > + if (value >= 0) > + do_set_tcache_count (value); > + break; > + case M_TCACHE_MAX: > + if (value >= 0) > + do_set_tcache_max (value); > + break; > + case M_TCACHE_UNSORTED_LIMIT: > + if (value >= 0) > + do_set_tcache_unsorted_limit (value); > + break; > +#endif Remove. > } > __libc_lock_unlock (av->mutex); > return res; >
On 03/10/2017 06:02 AM, DJ Delorie wrote: > +typedef struct TCacheEntry { > + struct TCacheEntry *next; > +} TCacheEntry; > + > +/* There is one of these for each thread, which contains the > + per-thread cache (hence "TCache"). Keeping overall size low is > + mildly important. Note that COUNTS and ENTRIES are redundant, this > + is for performance reasons. */ > +typedef struct TCache { > + char counts[TCACHE_IDX]; > + TCacheEntry *entries[TCACHE_IDX]; > +} TCache; These type names remind me of Turbo Vision. I don't think we use the CamelCase convention in glibc. :) I would like to see some discussion of the per-thread overhead the thread cache introduces, maybe along the documentation of the tunables, both the static overhead (due to new per-thread state) and some bounds on the additional per-thread allocation retention. Thanks, Florian
diff --git a/config.make.in b/config.make.in index 5836b32..0290d83 100644 --- a/config.make.in +++ b/config.make.in @@ -77,6 +77,8 @@ multi-arch = @multi_arch@ mach-interface-list = @mach_interface_list@ +experimental-malloc = @experimental_malloc@ + nss-crypt = @libc_cv_nss_crypt@ static-nss-crypt = @libc_cv_static_nss_crypt@ diff --git a/configure.ac b/configure.ac index 4a77411..b929012 100644 --- a/configure.ac +++ b/configure.ac @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch], [multi_arch=$enableval], [multi_arch=default]) +AC_ARG_ENABLE([experimental-malloc], + AC_HELP_STRING([--disable-experimental-malloc], + [disable experimental malloc features]), + [experimental_malloc=$enableval], + [experimental_malloc=yes]) +AC_SUBST(experimental_malloc) + AC_ARG_ENABLE([nss-crypt], AC_HELP_STRING([--enable-nss-crypt], [enable libcrypt to use nss]), diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list index cb9e8f1..37620c8 100644 --- a/elf/dl-tunables.list +++ b/elf/dl-tunables.list @@ -76,5 +76,17 @@ glibc { minval: 1 security_level: SXID_IGNORE } + tcache_max { + type: SIZE_T + env_alias: MALLOC_TCACHE_MAX + } + tcache_count { + type: SIZE_T + env_alias: MALLOC_TCACHE_COUNT + } + tcache_unsorted_limit { + type: SIZE_T + env_alias: MALLOC_TCACHE_UNSORTED_LIMIT + } } } diff --git a/malloc/Makefile b/malloc/Makefile index e93b83b..dd8a43a 100644 --- a/malloc/Makefile +++ b/malloc/Makefile @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV) tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3 tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV) +ifeq ($(experimental-malloc),yes) +CPPFLAGS-malloc.c += -DUSE_TCACHE +endif # Uncomment this for test releases. For public releases it is too expensive. 
#CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1 diff --git a/malloc/arena.c b/malloc/arena.c index d49e4a2..79e918f 100644 --- a/malloc/arena.c +++ b/malloc/arena.c @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t) DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t) DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t) DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t) +#if USE_TCACHE +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t) +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t) +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t) +#endif #else /* Initialization routine. */ #include <string.h> @@ -322,6 +327,11 @@ ptmalloc_init (void) TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max); TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max); TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test); +#if USE_TCACHE + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max); + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count); + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit); +#endif __libc_lock_unlock (main_arena.mutex); #else const char *s = NULL; @@ -371,7 +381,23 @@ ptmalloc_init (void) if (memcmp (envline, "ARENA_TEST", 10) == 0) __libc_mallopt (M_ARENA_TEST, atoi (&envline[11])); } +#if USE_TCACHE + if (!__builtin_expect (__libc_enable_secure, 0)) + { + if (memcmp (envline, "TCACHE_MAX", 10) == 0) + __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11])); + } +#endif break; +#if USE_TCACHE + case 12: + if (!__builtin_expect (__libc_enable_secure, 0)) + { + if (memcmp (envline, "TCACHE_COUNT", 12) == 0) + __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13])); + } + break; +#endif case 15: if (!__builtin_expect (__libc_enable_secure, 0)) { @@ -381,6 +407,15 @@ ptmalloc_init (void) __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16])); } break; +#if USE_TCACHE + case 21: + if (!__builtin_expect (__libc_enable_secure, 0)) + { + if 
(memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0) + __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22])); + } + break; +#endif default: break; } diff --git a/malloc/malloc.c b/malloc/malloc.c index e29105c..8cd03d8 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line, } #endif +#ifndef USE_TCACHE +# define USE_TCACHE 0 +#endif +#if USE_TCACHE +/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */ +# define MAX_TCACHE_SIZE (MALLOC_ALIGNMENT * 63) +# define TCACHE_IDX ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1) +# define size2tidx_(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT) + +# define tidx2csize(idx) ((idx) * MALLOC_ALIGNMENT + SIZE_SZ) +# define tidx2usize(idx) ((idx) * MALLOC_ALIGNMENT) + +/* When "x" is a user-provided size. */ +# define usize2tidx(x) size2tidx_ (x) +/* When "x" is from chunksize(). */ +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ) + +/* Rounds up, so... + idx 0 bytes 0 + idx 1 bytes 1..8 + idx 2 bytes 9..16 + etc. */ + +/* This is another arbitrary limit, which tunables can change. */ +# define TCACHE_FILL_COUNT 7 +#endif + /* REALLOC_ZERO_BYTES_FREES should be set if a call to @@ -1711,6 +1738,17 @@ struct malloc_par /* First address handed out by MORECORE/sbrk. */ char *sbrk_base; + +#if USE_TCACHE + /* Maximum number of buckets to use. */ + size_t tcache_max; + size_t tcache_max_bytes; + /* Maximum number of chunks in each bucket. */ + size_t tcache_count; + /* Maximum number of chunks to remove from the unsorted list, which + don't match. */ + size_t tcache_unsorted_limit; +#endif }; /* There are several instances of this struct ("arenas") in this @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ = .trim_threshold = DEFAULT_TRIM_THRESHOLD, #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 
2 : 8)) .arena_test = NARENAS_FROM_NCORES (1) +#if USE_TCACHE + , + .tcache_count = TCACHE_FILL_COUNT, + .tcache_max = TCACHE_IDX, + .tcache_max_bytes = tidx2usize (TCACHE_IDX-1), + .tcache_unsorted_limit = 0 /* No limit */ +#endif }; +/* Non public mallopt parameters. */ +#if USE_TCACHE +# define M_TCACHE_COUNT -9 +# define M_TCACHE_MAX -10 +# define M_TCACHE_UNSORTED_LIMIT -11 +#endif + /* Maximum size of memory handled in fastbins. */ static INTERNAL_SIZE_T global_max_fast; @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size) /*------------------------ Public wrappers. --------------------------------*/ +#if USE_TCACHE + +typedef struct TCacheEntry { + struct TCacheEntry *next; +} TCacheEntry; + +/* There is one of these for each thread, which contains the + per-thread cache (hence "TCache"). Keeping overall size low is + mildly important. Note that COUNTS and ENTRIES are redundant, this + is for performance reasons. */ +typedef struct TCache { + char counts[TCACHE_IDX]; + TCacheEntry *entries[TCACHE_IDX]; +} TCache; + +static __thread char tcache_shutting_down = 0; +static __thread TCache *tcache = NULL; + +static void +tcache_put (mchunkptr chunk, size_t tc_idx) +{ + TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk); + e->next = tcache->entries[tc_idx]; + tcache->entries[tc_idx] = e; + ++(tcache->counts[tc_idx]); +} + +static void * +tcache_get (size_t tc_idx) +{ + TCacheEntry *e = tcache->entries[tc_idx]; + tcache->entries[tc_idx] = e->next; + --(tcache->counts[tc_idx]); + return (void *) e; +} + +static void __attribute__ ((section ("__libc_thread_freeres_fn"))) +tcache_thread_freeres (void) +{ + int i; + TCache *tcache_tmp = tcache; + + if (!tcache) + return; + + tcache = NULL; + + for (i = 0; i < TCACHE_IDX; ++i) { + while (tcache_tmp->entries[i]) + { + TCacheEntry *e = tcache_tmp->entries[i]; + tcache_tmp->entries[i] = e->next; + __libc_free (e); + } + } + + __libc_free (tcache_tmp); + + tcache_shutting_down = 1; +} +text_set_element 
(__libc_thread_subfreeres, tcache_thread_freeres); + +static void +tcache_init(void) +{ + mstate ar_ptr; + void *victim = 0; + const size_t bytes = sizeof (TCache); + + if (tcache_shutting_down) + return; + + arena_get (ar_ptr, bytes); + victim = _int_malloc (ar_ptr, bytes); + if (!victim && ar_ptr != NULL) + { + ar_ptr = arena_get_retry (ar_ptr, bytes); + victim = _int_malloc (ar_ptr, bytes); + } + + + if (ar_ptr != NULL) + __libc_lock_unlock (ar_ptr->mutex); + + if (victim) + { + tcache = (TCache *) victim; + memset (tcache, 0, sizeof (TCache)); + } + +} + +#define MAYBE_INIT_TCACHE() \ + if (__glibc_unlikely (tcache == NULL)) \ + tcache_init(); + +#else +#define MAYBE_INIT_TCACHE() +#endif + void * __libc_malloc (size_t bytes) { @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes) = atomic_forced_read (__malloc_hook); if (__builtin_expect (hook != NULL, 0)) return (*hook)(bytes, RETURN_ADDRESS (0)); +#if USE_TCACHE + /* int_free also calls request2size, be careful to not pad twice. */ + size_t tbytes = request2size (bytes); + size_t tc_idx = csize2tidx (tbytes); + + MAYBE_INIT_TCACHE (); + + if (tc_idx < mp_.tcache_max + && tc_idx < TCACHE_IDX /* to appease gcc */ + && tcache + && tcache->entries[tc_idx] != NULL) + { + return tcache_get (tc_idx); + } +#endif arena_get (ar_ptr, bytes); @@ -2943,6 +3110,8 @@ __libc_free (void *mem) return; } + MAYBE_INIT_TCACHE (); + ar_ptr = arena_for_chunk (p); _int_free (ar_ptr, p, 0); } @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes) if (chunk_is_mmapped (oldp)) ar_ptr = NULL; else - ar_ptr = arena_for_chunk (oldp); + { + MAYBE_INIT_TCACHE (); + ar_ptr = arena_for_chunk (oldp); + } /* Little security check which won't hurt performance: the allocator never wrapps around at the end of the address space. 
Therefore @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size) sz = bytes; + MAYBE_INIT_TCACHE (); + arena_get (av, sz); if (av) { @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes) mchunkptr fwd; /* misc temp for linking */ mchunkptr bck; /* misc temp for linking */ +#if USE_TCACHE + size_t tcache_unsorted_count; /* count of unsorted chunks processed */ +#endif + const char *errstr = NULL; /* @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes) return NULL; } check_remalloced_chunk (av, victim, nb); +#if USE_TCACHE + /* While we're here, if we see other chunks of the same size, + stash them in the tcache. */ + size_t tc_idx = csize2tidx (nb); + if (tcache && tc_idx < mp_.tcache_max) + { + mchunkptr tc_victim; + int found = 0; + + /* While bin not empty and tcache not full, copy chunks over. */ + while (tcache->counts[tc_idx] < mp_.tcache_count + && (pp = *fb) != NULL) + { + do + { + tc_victim = pp; + if (tc_victim == NULL) + break; + } + while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim)) + != tc_victim); + if (tc_victim != 0) + { + tcache_put (tc_victim, tc_idx); + ++found; + } + } + } +#endif void *p = chunk2mem (victim); alloc_perturb (p, bytes); return p; @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes) if (av != &main_arena) set_non_main_arena (victim); check_malloced_chunk (av, victim, nb); +#if USE_TCACHE + /* While we're here, if we see other chunks of the same size, + stash them in the tcache. */ + size_t tc_idx = csize2tidx (nb); + if (tcache && tc_idx < mp_.tcache_max) + { + mchunkptr tc_victim; + int found = 0; + + /* While bin not empty and tcache not full, copy chunks over. 
*/ + while (tcache->counts[tc_idx] < mp_.tcache_count + && (tc_victim = last (bin)) != bin) + { + if (tc_victim != 0) + { + bck = tc_victim->bk; + set_inuse_bit_at_offset (tc_victim, nb); + if (av != &main_arena) + set_non_main_arena (tc_victim); + bin->bk = bck; + bck->fd = bin; + + tcache_put (tc_victim, tc_idx); + ++found; + } + } + } +#endif void *p = chunk2mem (victim); alloc_perturb (p, bytes); return p; @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes) otherwise need to expand memory to service a "small" request. */ +#if USE_TCACHE + INTERNAL_SIZE_T tcache_nb = 0; + size_t tc_idx = csize2tidx (nb); + if (tcache && tc_idx < mp_.tcache_max) + tcache_nb = nb; + int return_cached = 0; + + tcache_unsorted_count = 0; +#endif + for (;; ) { int iters = 0; @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes) set_inuse_bit_at_offset (victim, size); if (av != &main_arena) set_non_main_arena (victim); +#if USE_TCACHE + /* Fill cache first, return to user only if cache fills. + We may return one of these chunks later. */ + if (tcache_nb + && tcache->counts[tc_idx] < mp_.tcache_count) + { + tcache_put (victim, tc_idx); + return_cached = 1; + continue; + } + else + { +#endif check_malloced_chunk (av, victim, nb); void *p = chunk2mem (victim); alloc_perturb (p, bytes); return p; +#if USE_TCACHE + } +#endif } /* place chunk in bin */ @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes) fwd->bk = victim; bck->fd = victim; +#if USE_TCACHE + /* If we've processed as many chunks as we're allowed while + filling the cache, return one of the cached ones. */ + ++tcache_unsorted_count; + if (return_cached + && mp_.tcache_unsorted_limit > 0 + && tcache_unsorted_count > mp_.tcache_unsorted_limit) + { + return tcache_get (tc_idx); + } +#endif + #define MAX_ITERS 10000 if (++iters >= MAX_ITERS) break; } +#if USE_TCACHE + /* If all the small chunks we found ended up cached, return one now. 
*/ + if (return_cached) + { + return tcache_get (tc_idx); + } +#endif + /* If a large request, scan through the chunks of current bin in sorted order to find smallest that fits. Use the skip list for this. @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock) check_inuse_chunk(av, p); +#if USE_TCACHE + { + size_t tc_idx = csize2tidx (size); + + if (tcache + && tc_idx < mp_.tcache_max + && tcache->counts[tc_idx] < mp_.tcache_count) + { + tcache_put (p, tc_idx); + return; + } + } +#endif + /* If eligible, place chunk on a fastbin so it can be found and used quickly in malloc. @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value) return 1; } +#if USE_TCACHE +static inline int +__always_inline +do_set_tcache_max (size_t value) +{ + LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes); + if (value >= 0 && value <= MAX_TCACHE_SIZE) + { + mp_.tcache_max_bytes = value; + mp_.tcache_max = usize2tidx (value) + 1; + } + return 1; +} + +static inline int +__always_inline +do_set_tcache_count (size_t value) +{ + LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count); + mp_.tcache_count = value; + return 1; +} + +static inline int +__always_inline +do_set_tcache_unsorted_limit (size_t value) +{ + LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit); + mp_.tcache_unsorted_limit = value; + return 1; +} +#endif int __libc_mallopt (int param_number, int value) @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value) if (value > 0) do_set_arena_test (value); break; +#if USE_TCACHE + case M_TCACHE_COUNT: + if (value >= 0) + do_set_tcache_count (value); + break; + case M_TCACHE_MAX: + if (value >= 0) + do_set_tcache_max (value); + break; + case M_TCACHE_UNSORTED_LIMIT: + if (value >= 0) + do_set_tcache_unsorted_limit (value); + break; +#endif } __libc_lock_unlock (av->mutex); return res;