[updated] malloc per-thread cache ready for review

Submitted by DJ Delorie on March 10, 2017, 5:02 a.m.

Details

Message ID xna88tzpuu.fsf@greed.delorie.com
State New
Headers show

Commit Message

DJ Delorie March 10, 2017, 5:02 a.m.
Branch updated...

    Fix whitespace around operators.
    
    Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper
    
    Add helper functions for tcache_get() and tcache_put() to collect
    common code in one spot.

Updated patch attached.  (git diff origin/master...origin/dj/malloc-tcache)

I also put a copy of the oocalc workload online, as a sample of the
workloads I've been using to benchmark.  It's 16M compressed, and you
have to decompress it (to 65M) to use it with the simulator.

  http://www.delorie.com/tmp/oocalc.wl.bz2

(if you use wget, use "-U malloc" ;)

Comments

Will Newton March 17, 2017, 10:49 a.m.
On Fri, Mar 10, 2017 at 5:02 AM, DJ Delorie <dj@redhat.com> wrote:
>
> Branch updated...
>
>     Fix whitespace around operators.
>
>     Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper
>
>     Add helper functions for tcache_get() and tcache_put() to collect
>     common code in one spot.
>
> Updated patch attached.  (git diff origin/master...origin/dj/malloc-tcache)

This looks OK to me from a basic review — the code looks reasonable,
and no testsuite regressions or warnings were added.

It could do with a ChangeLog and NEWS entry though.

It would be good to get wider performance testing, on the
performance-critical workloads that other people may have.

> I also put a copy of the oocalc workload online, as a sample of the
> workloads I've been using to benchmark.  It's 16M compressed, and you
> have to decompress it (to 65M) to use it with the simulator.
>
>   http://www.delorie.com/tmp/oocalc.wl.bz2
>
> (if you use wget, use "-U malloc" ;)
>
> diff --git a/config.make.in b/config.make.in
> index 5836b32..0290d83 100644
> --- a/config.make.in
> +++ b/config.make.in
> @@ -77,6 +77,8 @@ multi-arch = @multi_arch@
>
>  mach-interface-list = @mach_interface_list@
>
> +experimental-malloc = @experimental_malloc@
> +
>  nss-crypt = @libc_cv_nss_crypt@
>  static-nss-crypt = @libc_cv_static_nss_crypt@
>
> diff --git a/configure.ac b/configure.ac
> index 4a77411..b929012 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch],
>               [multi_arch=$enableval],
>               [multi_arch=default])
>
> +AC_ARG_ENABLE([experimental-malloc],
> +             AC_HELP_STRING([--disable-experimental-malloc],
> +                            [disable experimental malloc features]),
> +             [experimental_malloc=$enableval],
> +             [experimental_malloc=yes])
> +AC_SUBST(experimental_malloc)
> +
>  AC_ARG_ENABLE([nss-crypt],
>               AC_HELP_STRING([--enable-nss-crypt],
>                              [enable libcrypt to use nss]),
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index cb9e8f1..37620c8 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -76,5 +76,17 @@ glibc {
>        minval: 1
>        security_level: SXID_IGNORE
>      }
> +    tcache_max {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_MAX
> +    }
> +    tcache_count {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_COUNT
> +    }
> +    tcache_unsorted_limit {
> +      type: SIZE_T
> +      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
> +    }
>    }
>  }
> diff --git a/malloc/Makefile b/malloc/Makefile
> index e93b83b..dd8a43a 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
>  tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
>  tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
>
> +ifeq ($(experimental-malloc),yes)
> +CPPFLAGS-malloc.c += -DUSE_TCACHE
> +endif
>  # Uncomment this for test releases.  For public releases it is too expensive.
>  #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
>
> diff --git a/malloc/arena.c b/malloc/arena.c
> index d49e4a2..79e918f 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
>  DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
> +#if USE_TCACHE
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
> +#endif
>  #else
>  /* Initialization routine. */
>  #include <string.h>
> @@ -322,6 +327,11 @@ ptmalloc_init (void)
>    TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
>    TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
>    TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
> +#if USE_TCACHE
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
> +  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
> +#endif
>    __libc_lock_unlock (main_arena.mutex);
>  #else
>    const char *s = NULL;
> @@ -371,7 +381,23 @@ ptmalloc_init (void)
>                    if (memcmp (envline, "ARENA_TEST", 10) == 0)
>                      __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
>                  }
> +#if USE_TCACHE
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_MAX", 10) == 0)
> +                    __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
> +                }
> +#endif
>                break;
> +#if USE_TCACHE
> +            case 12:
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
> +                    __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
> +                }
> +             break;
> +#endif
>              case 15:
>                if (!__builtin_expect (__libc_enable_secure, 0))
>                  {
> @@ -381,6 +407,15 @@ ptmalloc_init (void)
>                      __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
>                  }
>                break;
> +#if USE_TCACHE
> +            case 21:
> +              if (!__builtin_expect (__libc_enable_secure, 0))
> +                {
> +                  if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
> +                    __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
> +                }
> +             break;
> +#endif
>              default:
>                break;
>              }
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index e29105c..8cd03d8 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
>  }
>  #endif
>
> +#ifndef USE_TCACHE
> +# define USE_TCACHE 0
> +#endif
> +#if USE_TCACHE
> +/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
> +# define MAX_TCACHE_SIZE       (MALLOC_ALIGNMENT * 63)
> +# define TCACHE_IDX            ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
> +# define size2tidx_(bytes)     (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
> +
> +# define tidx2csize(idx)       ((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
> +# define tidx2usize(idx)       ((idx) * MALLOC_ALIGNMENT)
> +
> +/* When "x" is a user-provided size.  */
> +# define usize2tidx(x) size2tidx_ (x)
> +/* When "x" is from chunksize().  */
> +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
> +
> +/* Rounds up, so...
> +   idx 0   bytes 0
> +   idx 1   bytes 1..8
> +   idx 2   bytes 9..16
> +   etc.  */
> +
> +/* This is another arbitrary limit, which tunables can change.  */
> +# define TCACHE_FILL_COUNT 7
> +#endif
> +
>
>  /*
>    REALLOC_ZERO_BYTES_FREES should be set if a call to
> @@ -1711,6 +1738,17 @@ struct malloc_par
>
>    /* First address handed out by MORECORE/sbrk.  */
>    char *sbrk_base;
> +
> +#if USE_TCACHE
> +  /* Maximum number of buckets to use.  */
> +  size_t tcache_max;
> +  size_t tcache_max_bytes;
> +  /* Maximum number of chunks in each bucket.  */
> +  size_t tcache_count;
> +  /* Maximum number of chunks to remove from the unsorted list, which
> +     don't match.  */
> +  size_t tcache_unsorted_limit;
> +#endif
>  };
>
>  /* There are several instances of this struct ("arenas") in this
> @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ =
>    .trim_threshold = DEFAULT_TRIM_THRESHOLD,
>  #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
>    .arena_test = NARENAS_FROM_NCORES (1)
> +#if USE_TCACHE
> +  ,
> +  .tcache_count = TCACHE_FILL_COUNT,
> +  .tcache_max = TCACHE_IDX,
> +  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
> +  .tcache_unsorted_limit = 0 /* No limit */
> +#endif
>  };
>
> +/*  Non public mallopt parameters.  */
> +#if USE_TCACHE
> +# define M_TCACHE_COUNT  -9
> +# define M_TCACHE_MAX  -10
> +# define M_TCACHE_UNSORTED_LIMIT  -11
> +#endif
> +
>  /* Maximum size of memory handled in fastbins.  */
>  static INTERNAL_SIZE_T global_max_fast;
>
> @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size)
>
>  /*------------------------ Public wrappers. --------------------------------*/
>
> +#if USE_TCACHE
> +
> +typedef struct TCacheEntry {
> +  struct TCacheEntry *next;
> +} TCacheEntry;
> +
> +/* There is one of these for each thread, which contains the
> +   per-thread cache (hence "TCache").  Keeping overall size low is
> +   mildly important.  Note that COUNTS and ENTRIES are redundant, this
> +   is for performance reasons.  */
> +typedef struct TCache {
> +  char counts[TCACHE_IDX];
> +  TCacheEntry *entries[TCACHE_IDX];
> +} TCache;
> +
> +static __thread char tcache_shutting_down = 0;
> +static __thread TCache *tcache = NULL;
> +
> +static void
> +tcache_put (mchunkptr chunk, size_t tc_idx)
> +{
> +  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
> +  e->next = tcache->entries[tc_idx];
> +  tcache->entries[tc_idx] = e;
> +  ++(tcache->counts[tc_idx]);
> +}
> +
> +static void *
> +tcache_get (size_t tc_idx)
> +{
> +  TCacheEntry *e = tcache->entries[tc_idx];
> +  tcache->entries[tc_idx] = e->next;
> +  --(tcache->counts[tc_idx]);
> +  return (void *) e;
> +}
> +
> +static void __attribute__ ((section ("__libc_thread_freeres_fn")))
> +tcache_thread_freeres (void)
> +{
> +  int i;
> +  TCache *tcache_tmp = tcache;
> +
> +  if (!tcache)
> +    return;
> +
> +  tcache = NULL;
> +
> +  for (i = 0; i < TCACHE_IDX; ++i) {
> +    while (tcache_tmp->entries[i])
> +      {
> +       TCacheEntry *e = tcache_tmp->entries[i];
> +       tcache_tmp->entries[i] = e->next;
> +       __libc_free (e);
> +      }
> +  }
> +
> +  __libc_free (tcache_tmp);
> +
> +  tcache_shutting_down = 1;
> +}
> +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
> +
> +static void
> +tcache_init(void)
> +{
> +  mstate ar_ptr;
> +  void *victim = 0;
> +  const size_t bytes = sizeof (TCache);
> +
> +  if (tcache_shutting_down)
> +    return;
> +
> +  arena_get (ar_ptr, bytes);
> +  victim = _int_malloc (ar_ptr, bytes);
> +  if (!victim && ar_ptr != NULL)
> +    {
> +      ar_ptr = arena_get_retry (ar_ptr, bytes);
> +      victim = _int_malloc (ar_ptr, bytes);
> +    }
> +
> +
> +  if (ar_ptr != NULL)
> +    __libc_lock_unlock (ar_ptr->mutex);
> +
> +  if (victim)
> +    {
> +      tcache = (TCache *) victim;
> +      memset (tcache, 0, sizeof (TCache));
> +    }
> +
> +}
> +
> +#define MAYBE_INIT_TCACHE() \
> +  if (__glibc_unlikely (tcache == NULL)) \
> +    tcache_init();
> +
> +#else
> +#define MAYBE_INIT_TCACHE()
> +#endif
> +
>  void *
>  __libc_malloc (size_t bytes)
>  {
> @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes)
>      = atomic_forced_read (__malloc_hook);
>    if (__builtin_expect (hook != NULL, 0))
>      return (*hook)(bytes, RETURN_ADDRESS (0));
> +#if USE_TCACHE
> +  /* int_free also calls request2size, be careful to not pad twice.  */
> +  size_t tbytes = request2size (bytes);
> +  size_t tc_idx = csize2tidx (tbytes);
> +
> +  MAYBE_INIT_TCACHE ();
> +
> +  if (tc_idx < mp_.tcache_max
> +      && tc_idx < TCACHE_IDX /* to appease gcc */
> +      && tcache
> +      && tcache->entries[tc_idx] != NULL)
> +    {
> +      return tcache_get (tc_idx);
> +    }
> +#endif
>
>    arena_get (ar_ptr, bytes);
>
> @@ -2943,6 +3110,8 @@ __libc_free (void *mem)
>        return;
>      }
>
> +  MAYBE_INIT_TCACHE ();
> +
>    ar_ptr = arena_for_chunk (p);
>    _int_free (ar_ptr, p, 0);
>  }
> @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes)
>    if (chunk_is_mmapped (oldp))
>      ar_ptr = NULL;
>    else
> -    ar_ptr = arena_for_chunk (oldp);
> +    {
> +      MAYBE_INIT_TCACHE ();
> +      ar_ptr = arena_for_chunk (oldp);
> +    }
>
>    /* Little security check which won't hurt performance: the allocator
>       never wrapps around at the end of the address space.  Therefore
> @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size)
>
>    sz = bytes;
>
> +  MAYBE_INIT_TCACHE ();
> +
>    arena_get (av, sz);
>    if (av)
>      {
> @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes)
>    mchunkptr fwd;                    /* misc temp for linking */
>    mchunkptr bck;                    /* misc temp for linking */
>
> +#if USE_TCACHE
> +  size_t tcache_unsorted_count;            /* count of unsorted chunks processed */
> +#endif
> +
>    const char *errstr = NULL;
>
>    /*
> @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes)
>                return NULL;
>              }
>            check_remalloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> +         /* While we're here, if we see other chunks of the same size,
> +            stash them in the tcache.  */
> +         size_t tc_idx = csize2tidx (nb);
> +         if (tcache && tc_idx < mp_.tcache_max)
> +           {
> +             mchunkptr tc_victim;
> +             int found = 0;
> +
> +             /* While bin not empty and tcache not full, copy chunks over.  */
> +             while (tcache->counts[tc_idx] < mp_.tcache_count
> +                    && (pp = *fb) != NULL)
> +               {
> +                 do
> +                   {
> +                     tc_victim = pp;
> +                     if (tc_victim == NULL)
> +                       break;
> +                   }
> +                 while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
> +                        != tc_victim);
> +                 if (tc_victim != 0)
> +                   {
> +                     tcache_put (tc_victim, tc_idx);
> +                     ++found;
> +                   }
> +               }
> +           }
> +#endif
>            void *p = chunk2mem (victim);
>            alloc_perturb (p, bytes);
>            return p;
> @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes)
>                if (av != &main_arena)
>                 set_non_main_arena (victim);
>                check_malloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> +         /* While we're here, if we see other chunks of the same size,
> +            stash them in the tcache.  */
> +         size_t tc_idx = csize2tidx (nb);
> +         if (tcache && tc_idx < mp_.tcache_max)
> +           {
> +             mchunkptr tc_victim;
> +             int found = 0;
> +
> +             /* While bin not empty and tcache not full, copy chunks over.  */
> +             while (tcache->counts[tc_idx] < mp_.tcache_count
> +                    && (tc_victim = last (bin)) != bin)
> +               {
> +                 if (tc_victim != 0)
> +                   {
> +                     bck = tc_victim->bk;
> +                     set_inuse_bit_at_offset (tc_victim, nb);
> +                     if (av != &main_arena)
> +                       set_non_main_arena (tc_victim);
> +                     bin->bk = bck;
> +                     bck->fd = bin;
> +
> +                     tcache_put (tc_victim, tc_idx);
> +                     ++found;
> +                   }
> +               }
> +           }
> +#endif
>                void *p = chunk2mem (victim);
>                alloc_perturb (p, bytes);
>                return p;
> @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes)
>       otherwise need to expand memory to service a "small" request.
>     */
>
> +#if USE_TCACHE
> +  INTERNAL_SIZE_T tcache_nb = 0;
> +  size_t tc_idx = csize2tidx (nb);
> +  if (tcache && tc_idx < mp_.tcache_max)
> +    tcache_nb = nb;
> +  int return_cached = 0;
> +
> +  tcache_unsorted_count = 0;
> +#endif
> +
>    for (;; )
>      {
>        int iters = 0;
> @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes)
>                set_inuse_bit_at_offset (victim, size);
>                if (av != &main_arena)
>                 set_non_main_arena (victim);
> +#if USE_TCACHE
> +             /* Fill cache first, return to user only if cache fills.
> +                We may return one of these chunks later.  */
> +             if (tcache_nb
> +                 && tcache->counts[tc_idx] < mp_.tcache_count)
> +               {
> +                 tcache_put (victim, tc_idx);
> +                 return_cached = 1;
> +                 continue;
> +               }
> +             else
> +               {
> +#endif
>                check_malloced_chunk (av, victim, nb);
>                void *p = chunk2mem (victim);
>                alloc_perturb (p, bytes);
>                return p;
> +#if USE_TCACHE
> +               }
> +#endif
>              }
>
>            /* place chunk in bin */
> @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes)
>            fwd->bk = victim;
>            bck->fd = victim;
>
> +#if USE_TCACHE
> +      /* If we've processed as many chunks as we're allowed while
> +        filling the cache, return one of the cached ones.  */
> +      ++tcache_unsorted_count;
> +      if (return_cached
> +         && mp_.tcache_unsorted_limit > 0
> +         && tcache_unsorted_count > mp_.tcache_unsorted_limit)
> +       {
> +         return tcache_get (tc_idx);
> +       }
> +#endif
> +
>  #define MAX_ITERS       10000
>            if (++iters >= MAX_ITERS)
>              break;
>          }
>
> +#if USE_TCACHE
> +      /* If all the small chunks we found ended up cached, return one now.  */
> +      if (return_cached)
> +       {
> +         return tcache_get (tc_idx);
> +       }
> +#endif
> +
>        /*
>           If a large request, scan through the chunks of current bin in
>           sorted order to find smallest that fits.  Use the skip list for this.
> @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>
>    check_inuse_chunk(av, p);
>
> +#if USE_TCACHE
> +  {
> +    size_t tc_idx = csize2tidx (size);
> +
> +    if (tcache
> +       && tc_idx < mp_.tcache_max
> +       && tcache->counts[tc_idx] < mp_.tcache_count)
> +      {
> +       tcache_put (p, tc_idx);
> +       return;
> +      }
> +  }
> +#endif
> +
>    /*
>      If eligible, place chunk on a fastbin so it can be found
>      and used quickly in malloc.
> @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value)
>    return 1;
>  }
>
> +#if USE_TCACHE
> +static inline int
> +__always_inline
> +do_set_tcache_max (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
> +  if (value >= 0 && value <= MAX_TCACHE_SIZE)
> +    {
> +      mp_.tcache_max_bytes = value;
> +      mp_.tcache_max = usize2tidx (value) + 1;
> +    }
> +  return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_count (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
> +  mp_.tcache_count = value;
> +  return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_unsorted_limit (size_t value)
> +{
> +  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
> +  mp_.tcache_unsorted_limit = value;
> +  return 1;
> +}
> +#endif
>
>  int
>  __libc_mallopt (int param_number, int value)
> @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value)
>        if (value > 0)
>         do_set_arena_test (value);
>        break;
> +#if USE_TCACHE
> +    case M_TCACHE_COUNT:
> +      if (value >= 0)
> +       do_set_tcache_count (value);
> +      break;
> +    case M_TCACHE_MAX:
> +      if (value >= 0)
> +       do_set_tcache_max (value);
> +      break;
> +    case M_TCACHE_UNSORTED_LIMIT:
> +      if (value >= 0)
> +       do_set_tcache_unsorted_limit (value);
> +      break;
> +#endif
>      }
>    __libc_lock_unlock (av->mutex);
>    return res;

Patch hide | download patch | download mbox

diff --git a/config.make.in b/config.make.in
index 5836b32..0290d83 100644
--- a/config.make.in
+++ b/config.make.in
@@ -77,6 +77,8 @@  multi-arch = @multi_arch@
 
 mach-interface-list = @mach_interface_list@
 
+experimental-malloc = @experimental_malloc@
+
 nss-crypt = @libc_cv_nss_crypt@
 static-nss-crypt = @libc_cv_static_nss_crypt@
 
diff --git a/configure.ac b/configure.ac
index 4a77411..b929012 100644
--- a/configure.ac
+++ b/configure.ac
@@ -313,6 +313,13 @@  AC_ARG_ENABLE([multi-arch],
 	      [multi_arch=$enableval],
 	      [multi_arch=default])
 
+AC_ARG_ENABLE([experimental-malloc],
+	      AC_HELP_STRING([--disable-experimental-malloc],
+			     [disable experimental malloc features]),
+	      [experimental_malloc=$enableval],
+	      [experimental_malloc=yes])
+AC_SUBST(experimental_malloc)
+
 AC_ARG_ENABLE([nss-crypt],
 	      AC_HELP_STRING([--enable-nss-crypt],
 			     [enable libcrypt to use nss]),
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index cb9e8f1..37620c8 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -76,5 +76,17 @@  glibc {
       minval: 1
       security_level: SXID_IGNORE
     }
+    tcache_max {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_MAX
+    }
+    tcache_count {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_COUNT
+    }
+    tcache_unsorted_limit {
+      type: SIZE_T
+      env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
+    }
   }
 }
diff --git a/malloc/Makefile b/malloc/Makefile
index e93b83b..dd8a43a 100644
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -168,6 +168,9 @@  tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
 tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
 tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
 
+ifeq ($(experimental-malloc),yes)
+CPPFLAGS-malloc.c += -DUSE_TCACHE
+endif
 # Uncomment this for test releases.  For public releases it is too expensive.
 #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
 
diff --git a/malloc/arena.c b/malloc/arena.c
index d49e4a2..79e918f 100644
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -236,6 +236,11 @@  DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
 DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
+#if USE_TCACHE
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
+DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
+#endif
 #else
 /* Initialization routine. */
 #include <string.h>
@@ -322,6 +327,11 @@  ptmalloc_init (void)
   TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
   TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
+#if USE_TCACHE
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
+  TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
+#endif
   __libc_lock_unlock (main_arena.mutex);
 #else
   const char *s = NULL;
@@ -371,7 +381,23 @@  ptmalloc_init (void)
                   if (memcmp (envline, "ARENA_TEST", 10) == 0)
                     __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
                 }
+#if USE_TCACHE
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_MAX", 10) == 0)
+                    __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
+                }
+#endif
               break;
+#if USE_TCACHE
+            case 12:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
+                    __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
+                }
+	      break;
+#endif
             case 15:
               if (!__builtin_expect (__libc_enable_secure, 0))
                 {
@@ -381,6 +407,15 @@  ptmalloc_init (void)
                     __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
                 }
               break;
+#if USE_TCACHE
+            case 21:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
+                    __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
+                }
+	      break;
+#endif
             default:
               break;
             }
diff --git a/malloc/malloc.c b/malloc/malloc.c
index e29105c..8cd03d8 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -297,6 +297,33 @@  __malloc_assert (const char *assertion, const char *file, unsigned int line,
 }
 #endif
 
+#ifndef USE_TCACHE
+# define USE_TCACHE 0
+#endif
+#if USE_TCACHE
+/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
+# define MAX_TCACHE_SIZE	(MALLOC_ALIGNMENT * 63)
+# define TCACHE_IDX		((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
+# define size2tidx_(bytes)	(((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+
+# define tidx2csize(idx)	((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
+# define tidx2usize(idx)	((idx) * MALLOC_ALIGNMENT)
+
+/* When "x" is a user-provided size.  */
+# define usize2tidx(x) size2tidx_ (x)
+/* When "x" is from chunksize().  */
+# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
+
+/* Rounds up, so...
+   idx 0   bytes 0
+   idx 1   bytes 1..8
+   idx 2   bytes 9..16
+   etc.  */
+
+/* This is another arbitrary limit, which tunables can change.  */
+# define TCACHE_FILL_COUNT 7
+#endif
+
 
 /*
   REALLOC_ZERO_BYTES_FREES should be set if a call to
@@ -1711,6 +1738,17 @@  struct malloc_par
 
   /* First address handed out by MORECORE/sbrk.  */
   char *sbrk_base;
+
+#if USE_TCACHE
+  /* Maximum number of buckets to use.  */
+  size_t tcache_max;
+  size_t tcache_max_bytes;
+  /* Maximum number of chunks in each bucket.  */
+  size_t tcache_count;
+  /* Maximum number of chunks to remove from the unsorted list, which
+     don't match.  */
+  size_t tcache_unsorted_limit;
+#endif
 };
 
 /* There are several instances of this struct ("arenas") in this
@@ -1749,8 +1787,22 @@  static struct malloc_par mp_ =
   .trim_threshold = DEFAULT_TRIM_THRESHOLD,
 #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
   .arena_test = NARENAS_FROM_NCORES (1)
+#if USE_TCACHE
+  ,
+  .tcache_count = TCACHE_FILL_COUNT,
+  .tcache_max = TCACHE_IDX,
+  .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
+  .tcache_unsorted_limit = 0 /* No limit */
+#endif
 };
 
+/*  Non public mallopt parameters.  */
+#if USE_TCACHE
+# define M_TCACHE_COUNT  -9
+# define M_TCACHE_MAX  -10
+# define M_TCACHE_UNSORTED_LIMIT  -11
+#endif
+
 /* Maximum size of memory handled in fastbins.  */
 static INTERNAL_SIZE_T global_max_fast;
 
@@ -2874,6 +2926,106 @@  mremap_chunk (mchunkptr p, size_t new_size)
 
 /*------------------------ Public wrappers. --------------------------------*/
 
+#if USE_TCACHE
+
+typedef struct TCacheEntry {
+  struct TCacheEntry *next;
+} TCacheEntry;
+
+/* There is one of these for each thread, which contains the
+   per-thread cache (hence "TCache").  Keeping overall size low is
+   mildly important.  Note that COUNTS and ENTRIES are redundant, this
+   is for performance reasons.  */
+typedef struct TCache {
+  char counts[TCACHE_IDX];
+  TCacheEntry *entries[TCACHE_IDX];
+} TCache;
+
+static __thread char tcache_shutting_down = 0;
+static __thread TCache *tcache = NULL;
+
+static void
+tcache_put (mchunkptr chunk, size_t tc_idx)
+{
+  TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
+  e->next = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = e;
+  ++(tcache->counts[tc_idx]);
+}
+
+static void *
+tcache_get (size_t tc_idx)
+{
+  TCacheEntry *e = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = e->next;
+  --(tcache->counts[tc_idx]);
+  return (void *) e;
+}
+
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  int i;
+  TCache *tcache_tmp = tcache;
+
+  if (!tcache)
+    return;
+
+  tcache = NULL;
+
+  for (i = 0; i < TCACHE_IDX; ++i) {
+    while (tcache_tmp->entries[i])
+      {
+	TCacheEntry *e = tcache_tmp->entries[i];
+	tcache_tmp->entries[i] = e->next;
+	__libc_free (e);
+      }
+  }
+
+  __libc_free (tcache_tmp);
+
+  tcache_shutting_down = 1;
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+static void
+tcache_init(void)
+{
+  mstate ar_ptr;
+  void *victim = 0;
+  const size_t bytes = sizeof (TCache);
+
+  if (tcache_shutting_down)
+    return;
+
+  arena_get (ar_ptr, bytes);
+  victim = _int_malloc (ar_ptr, bytes);
+  if (!victim && ar_ptr != NULL)
+    {
+      ar_ptr = arena_get_retry (ar_ptr, bytes);
+      victim = _int_malloc (ar_ptr, bytes);
+    }
+
+
+  if (ar_ptr != NULL)
+    __libc_lock_unlock (ar_ptr->mutex);
+
+  if (victim)
+    {
+      tcache = (TCache *) victim;
+      memset (tcache, 0, sizeof (TCache));
+    }
+
+}
+
+#define MAYBE_INIT_TCACHE() \
+  if (__glibc_unlikely (tcache == NULL)) \
+    tcache_init();
+
+#else
+#define MAYBE_INIT_TCACHE()
+#endif
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -2884,6 +3036,21 @@  __libc_malloc (size_t bytes)
     = atomic_forced_read (__malloc_hook);
   if (__builtin_expect (hook != NULL, 0))
     return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size (bytes);
+  size_t tc_idx = csize2tidx (tbytes);
+
+  MAYBE_INIT_TCACHE ();
+
+  if (tc_idx < mp_.tcache_max
+      && tc_idx < TCACHE_IDX /* to appease gcc */
+      && tcache
+      && tcache->entries[tc_idx] != NULL)
+    {
+      return tcache_get (tc_idx);
+    }
+#endif
 
   arena_get (ar_ptr, bytes);
 
@@ -2943,6 +3110,8 @@  __libc_free (void *mem)
       return;
     }
 
+  MAYBE_INIT_TCACHE ();
+
   ar_ptr = arena_for_chunk (p);
   _int_free (ar_ptr, p, 0);
 }
@@ -2980,7 +3149,10 @@  __libc_realloc (void *oldmem, size_t bytes)
   if (chunk_is_mmapped (oldp))
     ar_ptr = NULL;
   else
-    ar_ptr = arena_for_chunk (oldp);
+    {
+      MAYBE_INIT_TCACHE ();
+      ar_ptr = arena_for_chunk (oldp);
+    }
 
   /* Little security check which won't hurt performance: the allocator
      never wrapps around at the end of the address space.  Therefore
@@ -3205,6 +3377,8 @@  __libc_calloc (size_t n, size_t elem_size)
 
   sz = bytes;
 
+  MAYBE_INIT_TCACHE ();
+
   arena_get (av, sz);
   if (av)
     {
@@ -3335,6 +3509,10 @@  _int_malloc (mstate av, size_t bytes)
   mchunkptr fwd;                    /* misc temp for linking */
   mchunkptr bck;                    /* misc temp for linking */
 
+#if USE_TCACHE
+  size_t tcache_unsorted_count;	    /* count of unsorted chunks processed */
+#endif
+
   const char *errstr = NULL;
 
   /*
@@ -3387,6 +3565,35 @@  _int_malloc (mstate av, size_t bytes)
               return NULL;
             }
           check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tcache && tc_idx < mp_.tcache_max)
+	    {
+	      mchunkptr tc_victim;
+
+	      /* While bin not empty and tcache not full, copy chunks
+		 over.  */
+	      while (tcache->counts[tc_idx] < mp_.tcache_count
+		     && (pp = *fb) != NULL)
+		{
+		  /* Pop one chunk with the same CAS protocol used for
+		     the fastbin removal above; PP is re-read on each
+		     failed exchange.  */
+		  do
+		    {
+		      tc_victim = pp;
+		      if (tc_victim == NULL)
+			break;
+		    }
+		  while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
+			 != tc_victim);
+		  if (tc_victim != 0)
+		    tcache_put (tc_victim, tc_idx);
+		}
+	    }
+#endif
           void *p = chunk2mem (victim);
           alloc_perturb (p, bytes);
           return p;
@@ -3425,6 +3632,34 @@  _int_malloc (mstate av, size_t bytes)
               if (av != &main_arena)
 		set_non_main_arena (victim);
               check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	  /* While we're here, if we see other chunks of the same size,
+	     stash them in the tcache.  */
+	  size_t tc_idx = csize2tidx (nb);
+	  if (tcache && tc_idx < mp_.tcache_max)
+	    {
+	      mchunkptr tc_victim;
+	      /* While bin not empty and tcache not full, copy chunks over.  */
+	      while (tcache->counts[tc_idx] < mp_.tcache_count
+		     && (tc_victim = last (bin)) != bin)
+		{
+		  if (tc_victim != 0)
+		    {
+		      /* Unlink TC_VICTIM from the small bin and mark it
+			 in use, mirroring the allocation of VICTIM
+			 above.  */
+		      bck = tc_victim->bk;
+		      set_inuse_bit_at_offset (tc_victim, nb);
+		      if (av != &main_arena)
+			set_non_main_arena (tc_victim);
+		      bin->bk = bck;
+		      bck->fd = bin;
+
+		      tcache_put (tc_victim, tc_idx);
+		    }
+		}
+	    }
+#endif
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
@@ -3463,6 +3698,16 @@  _int_malloc (mstate av, size_t bytes)
      otherwise need to expand memory to service a "small" request.
    */
 
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  size_t tc_idx = csize2tidx (nb);
+  if (tcache && tc_idx < mp_.tcache_max)
+    tcache_nb = nb;
+  int return_cached = 0;
+
+  tcache_unsorted_count = 0;
+#endif
+
   for (;; )
     {
       int iters = 0;
@@ -3523,10 +3768,26 @@  _int_malloc (mstate av, size_t bytes)
               set_inuse_bit_at_offset (victim, size);
               if (av != &main_arena)
 		set_non_main_arena (victim);
+#if USE_TCACHE
+	      /* Fill cache first, return to user only if cache fills.
+		 We may return one of these chunks later.  */
+	      if (tcache_nb
+		  && tcache->counts[tc_idx] < mp_.tcache_count)
+		{
+		  tcache_put (victim, tc_idx);
+		  return_cached = 1;
+		  continue;
+		}
+	      else
+		{
+#endif
               check_malloced_chunk (av, victim, nb);
               void *p = chunk2mem (victim);
               alloc_perturb (p, bytes);
               return p;
+#if USE_TCACHE
+		}
+#endif
             }
 
           /* place chunk in bin */
@@ -3593,11 +3854,31 @@  _int_malloc (mstate av, size_t bytes)
           fwd->bk = victim;
           bck->fd = victim;
 
+#if USE_TCACHE
+      /* If we've processed as many chunks as we're allowed while
+	 filling the cache, return one of the cached ones.  */
+      ++tcache_unsorted_count;
+      if (return_cached
+	  && mp_.tcache_unsorted_limit > 0
+	  && tcache_unsorted_count > mp_.tcache_unsorted_limit)
+	{
+	  return tcache_get (tc_idx);
+	}
+#endif
+
 #define MAX_ITERS       10000
           if (++iters >= MAX_ITERS)
             break;
         }
 
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+	{
+	  return tcache_get (tc_idx);
+	}
+#endif
+
       /*
          If a large request, scan through the chunks of current bin in
          sorted order to find smallest that fits.  Use the skip list for this.
@@ -3883,6 +4164,20 @@  _int_free (mstate av, mchunkptr p, int have_lock)
 
   check_inuse_chunk(av, p);
 
+#if USE_TCACHE
+  {
+    size_t tc_idx = csize2tidx (size);
+
+    if (tcache
+	&& tc_idx < mp_.tcache_max
+	&& tcache->counts[tc_idx] < mp_.tcache_count)
+      {
+	tcache_put (p, tc_idx);
+	return;
+      }
+  }
+#endif
+
   /*
     If eligible, place chunk on a fastbin so it can be found
     and used quickly in malloc.
@@ -4844,6 +5139,38 @@  do_set_arena_max (size_t value)
   return 1;
 }
 
+#if USE_TCACHE
+static inline int
+__always_inline
+do_set_tcache_max (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
+  /* VALUE is a size_t, so only the upper bound needs checking.  */
+  if (value <= MAX_TCACHE_SIZE)
+    {
+      mp_.tcache_max_bytes = value;
+      mp_.tcache_max = usize2tidx (value) + 1;
+    }
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_count (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
+  mp_.tcache_count = value;
+  return 1;
+}
+
+static inline int
+__always_inline
+do_set_tcache_unsorted_limit (size_t value)
+{
+  LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
+  mp_.tcache_unsorted_limit = value;
+  return 1;
+}
+#endif
 
 int
 __libc_mallopt (int param_number, int value)
@@ -4904,6 +5231,20 @@  __libc_mallopt (int param_number, int value)
       if (value > 0)
 	do_set_arena_test (value);
       break;
+#if USE_TCACHE
+    case M_TCACHE_COUNT:
+      if (value >= 0)
+	do_set_tcache_count (value);
+      break;
+    case M_TCACHE_MAX:
+      if (value >= 0)
+	do_set_tcache_max (value);
+      break;
+    case M_TCACHE_UNSORTED_LIMIT:
+      if (value >= 0)
+	do_set_tcache_unsorted_limit (value);
+      break;
+#endif
     }
   __libc_lock_unlock (av->mutex);
   return res;