
[v2,malloc] Use relaxed atomics for malloc have_fastchunks

Message ID DB6PR0801MB2053F5C639E8EBEE0E90CDB483660@DB6PR0801MB2053.eurprd08.prod.outlook.com
State New
Series [v2,malloc] Use relaxed atomics for malloc have_fastchunks

Commit Message

Wilco Dijkstra Sept. 21, 2017, 6:43 p.m. UTC
Currently free typically uses 2 atomic operations per call.  The have_fastchunks
flag indicates whether there are recently freed blocks in the fastbins.  This
is purely an optimization to avoid calling malloc_consolidate too often and to
avoid the overhead of walking all fast bins even if all are empty during a
sequence of allocations.  However, using catomic_or to update the flag is
completely unnecessary since it can be changed into a simple boolean and
accessed using relaxed atomics.  There is no change in multi-threaded behaviour
given that the flag is already approximate (it may be set when there are no
blocks in any fast bins, or it may be clear when there are free blocks that
could be consolidated).

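In essence, the patch replaces the flag bit hidden in av->flags (updated with
catomic_or / catomic_and) by a dedicated field accessed with relaxed atomics.
A condensed before/after sketch of the accesses, taken from the diff below:

    /* Before: flag bit in av->flags, toggled with atomic RMW operations.  */
    #define have_fastchunks(M)     (((M)->flags & FASTCHUNKS_BIT) == 0)
    #define clear_fastchunks(M)    catomic_or (&(M)->flags, FASTCHUNKS_BIT)
    #define set_fastchunks(M)      catomic_and (&(M)->flags, ~FASTCHUNKS_BIT)

    /* After: dedicated have_fastchunks field, plain relaxed load/store.  */
    atomic_store_relaxed (&av->have_fastchunks, true);   /* in _int_free */
    if (atomic_load_relaxed (&av->have_fastchunks))      /* in _int_malloc */
      malloc_consolidate (av);
    atomic_store_relaxed (&av->have_fastchunks, false);  /* in malloc_consolidate */
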
Performance of malloc/free improves by 27% on a simple benchmark on AArch64
(both single and multithreaded). The number of load/store exclusive instructions is
reduced by 33%. Bench-malloc-thread speeds up by ~3% in all cases.

Passes GLIBC tests. OK to commit?

ChangeLog:
2017-09-21  Wilco Dijkstra  <wdijkstr@arm.com>

	* malloc/malloc.c (FASTCHUNKS_BIT): Remove.
	(have_fastchunks): Remove.
	(clear_fastchunks): Remove.
	(set_fastchunks): Remove.
	(malloc_state): Add have_fastchunks.
	(malloc_init_state): Use have_fastchunks.
	(do_check_malloc_state): Remove incorrect invariant checks.
	(_int_malloc): Use have_fastchunks.
	(_int_free): Likewise.
	(malloc_consolidate): Likewise.

--

Comments

Wilco Dijkstra Sept. 26, 2017, 12:07 p.m. UTC | #1
DJ Delorie wrote:
    
> Results of benchmarks...  Note that dj2 is a synthetic test, so the
> slowdown is not surprising, and that all tests are averages of 16 runs
> except git-cinnabar-helper which is only two pristine and one patched
> (it takes 20 minutes per run and I got impatient ;).  Values are cycles,
> lower is better.
>
> Workload                      Pristine         Patched      
> 389ds                     9,121,687,695    8,017,021,813   87.89%
> dj2                       7,901,004,232    8,277,784,940  104.77%
> git-cinnabar-helper     100,123,244,269  90,058,896,622   89.95%
> okular-1                  3,648,656,309    3,220,751,900   88.27%
> oocalc                    1,053,984,703    1,009,859,213   95.81%
> qemu-virtio                 781,260,028      766,458,246   98.11%
> qemu-win7                   655,497,193      626,270,566   95.54%
> proprietary-2             2,112,159,165    1,977,684,058   93.63%
>                        
>                                                Mean     94.25%
>
> Patch looks good to me otherwise, caveat any further complaints about
> concurrency ;-)

Thanks, that looks very good indeed! Are these all multithreaded?
I'm working on a few tweaks to improve single-threaded performance.

Btw have you tried running these traces with say:
export GLIBC_TUNABLES=glibc.malloc.tcache_count=100

It would be interesting to find out whether that (or even larger values)
helps your traces as well, as it does with the benchmarks I've tried.

Wilco
Joseph Myers Oct. 17, 2017, 11:08 p.m. UTC | #2
I think this breaks the build for tilepro.  You're using atomic operations 
on a bool field, and tilepro only supports atomic operations on 4-byte 
objects and gives errors for other sizes.  Generic code should only use 
atomics on int-size / pointer-size objects unless it knows the particular 
architecture supports atomics of other sizes (e.g. through 
__HAVE_64B_ATOMICS).
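
For illustration, a minimal hypothetical sketch of that guideline: gate
wider-than-word atomics on the macro mentioned above.  The helper name and the
counter field are made up and are not glibc code.

    #include <stdint.h>
    #include <atomic.h>

    /* Sketch only: reset a 64-bit counter, using an atomic store only when
       the target is known to support 8-byte atomics.  */
    static void
    reset_counter (uint64_t *counter)
    {
    #if __HAVE_64B_ATOMICS
      /* This target provides 8-byte atomics, so a relaxed store is safe.  */
      atomic_store_relaxed (counter, 0);
    #else
      /* No 8-byte atomics: fall back to a plain store under the caller's lock.  */
      *counter = 0;
    #endif
    }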

In file included from ../include/atomic.h:50:0,
                 from malloc.c:216:
malloc.c: In function 'malloc_init_state':
../sysdeps/tile/tilepro/atomic-machine.h:72:7: error: call to '__atomic_error_bad_argument_size' declared with attribute warning: bad sizeof atomic argument [-Werror]
       __atomic_error_bad_argument_size ();                              \
       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../sysdeps/tile/tilepro/atomic-machine.h:78:3: note: in expansion of macro '__atomic_update'
   __atomic_update ((mem), 0, (newvalue))
   ^~~~~~~~~~~~~~~
../sysdeps/tile/tilepro/atomic-machine.h:94:40: note: in expansion of macro 'atomic_exchange_acq'
 #define atomic_store_relaxed(mem, val) atomic_exchange_acq (mem, val)
                                        ^~~~~~~~~~~~~~~~~~~
malloc.c:1818:3: note: in expansion of macro 'atomic_store_relaxed'
   atomic_store_relaxed (&av->have_fastchunks, false);
   ^~~~~~~~~~~~~~~~~~~~
Wilco Dijkstra Oct. 18, 2017, 11:37 a.m. UTC | #3
Joseph Myers wrote:
>
> I think this breaks the build for tilepro.  You're using atomic operations
> on a bool field, and tilepro only supports atomic operations on 4-byte 
> objects and gives errors for other sizes.  Generic code should only use 
> atomics on int-size / pointer-size objects unless it knows the particular 
> architecture supports atomics of other sizes (e.g. through 
> __HAVE_64B_ATOMICS).

I thought there were only issues with types >= 64 bits...
Anyway, I've updated the type to int - we can always decide not to use
atomics for this particular variable. I've committed this, which fixes
tilepro:

2017-10-18  Wilco Dijkstra  <wdijkstr@arm.com>

        * malloc/malloc.c (malloc_state): Use int for have_fastchunks since
        not all targets support atomics on bool.

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 51db44f..6b78968 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1673,7 +1673,8 @@ struct malloc_state
   int flags;
 
   /* Set if the fastbin chunks contain recently inserted free blocks.  */
-  bool have_fastchunks;
+  /* Note this is a bool but not all targets support atomics on booleans.  */
+  int have_fastchunks;
 
   /* Fastbins */
   mfastbinptr fastbinsY[NFASTBINS];
Chris Metcalf Oct. 18, 2017, 3:55 p.m. UTC | #4
On 10/17/2017 7:08 PM, Joseph Myers wrote:
> I think this breaks the build for tilepro.  You're using atomic operations
> on a bool field, and tilepro only supports atomic operations on 4-byte
> objects and gives errors for other sizes.  Generic code should only use
> atomics on int-size / pointer-size objects unless it knows the particular
> architecture supports atomics of other sizes (e.g. through
> __HAVE_64B_ATOMICS).

To be fair, we could also synthesize subword atomics for tilepro with compare
and swap on the surrounding word, so it's not technically impossible to add
this support.  And (as Wilco already demonstrated) it's trivial to fix the
caller to work around this problem.

However, it's also true that tilepro is no longer an actively-maintained
platform; the tilegx platform has pretty well supplanted it in practice.
Perhaps it is time to look at obsoleting tilepro so that its presence in the
tree doesn't block forward progress for other architectures.  I'm not aware
of any existing tilepro customers that are looking to use newer versions of
glibc.
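
A rough sketch of the subword approach Chris describes above: store a byte by
doing a compare-and-swap on the aligned word that contains it.  This is purely
illustrative; the helper below is not an existing glibc function, and it
assumes a little-endian byte order.

    #include <stdint.h>
    #include <atomic.h>

    /* Hypothetical helper: store one byte atomically using only word-sized
       atomics, by CASing the surrounding aligned 32-bit word.  */
    static void
    atomic_store_byte_via_word_cas (unsigned char *p, unsigned char val)
    {
      unsigned int *word = (unsigned int *) ((uintptr_t) p & ~(uintptr_t) 3);
      unsigned int shift = ((uintptr_t) p & 3) * 8;  /* little-endian byte lane */
      unsigned int mask = 0xffu << shift;
      unsigned int oldval, newval;

      do
        {
          oldval = atomic_load_relaxed (word);
          newval = (oldval & ~mask) | ((unsigned int) val << shift);
        }
      while (atomic_compare_and_exchange_val_acq (word, newval, oldval) != oldval);
    }
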
Joseph Myers Oct. 18, 2017, 4:51 p.m. UTC | #5
On Wed, 18 Oct 2017, Chris Metcalf wrote:

> To be fair, we could also synthesize subword atomics for tilepro with compare
> and swap on the surrounding word, so it's not technically impossible to add
> this support. And (as Wilco already demonstrated) it's trivial to fix the
> caller to work around this problem.

I'm pretty sure other architectures in glibc would also require subword
atomics to be synthesized, and some certainly have errors for them at
present - what's unique about tilepro is that the checks trigger on
subword relaxed atomic load/store (whereas more architectures would have
problems with non-relaxed atomics, compare/exchange, etc.).

> However, it's also true that tilepro is no longer an actively-maintained 
> platform; the tilegx platform has pretty well supplanted it in practice. 
> Perhaps it is time to look at obsoleting tilepro so that its presence in 
> the tree doesn't block forward progress for other architectures.  I'm 
> not aware of any existing tilepro customers that are looking to use 
> newer versions of glibc.

Certainly architecture maintainers can propose removal / obsoletion of 
their architectures (or subarchitecture variants, etc.), and in the 
absence of other people expressing interest in using that architecture I'd 
expect such removal / obsoletion proposals to achieve consensus.

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 1c2a0b05b78c84cea60ee998108180d51b1f1ddf..082c2b927727bff441cf48744265628d0bc40add 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1604,27 +1604,6 @@  typedef struct malloc_chunk *mfastbinptr;
 #define FASTBIN_CONSOLIDATION_THRESHOLD  (65536UL)
 
 /*
-   Since the lowest 2 bits in max_fast don't matter in size comparisons,
-   they are used as flags.
- */
-
-/*
-   FASTCHUNKS_BIT held in max_fast indicates that there are probably
-   some fastbin chunks. It is set true on entering a chunk into any
-   fastbin, and cleared only in malloc_consolidate.
-
-   The truth value is inverted so that have_fastchunks will be true
-   upon startup (since statics are zero-filled), simplifying
-   initialization checks.
- */
-
-#define FASTCHUNKS_BIT        (1U)
-
-#define have_fastchunks(M)     (((M)->flags & FASTCHUNKS_BIT) == 0)
-#define clear_fastchunks(M)    catomic_or (&(M)->flags, FASTCHUNKS_BIT)
-#define set_fastchunks(M)      catomic_and (&(M)->flags, ~FASTCHUNKS_BIT)
-
-/*
    NONCONTIGUOUS_BIT indicates that MORECORE does not return contiguous
    regions.  Otherwise, contiguity is exploited in merging together,
    when possible, results from consecutive MORECORE calls.
@@ -1672,6 +1651,17 @@  get_max_fast (void)
    ----------- Internal state representation and initialization -----------
  */
 
+/*
+   have_fastchunks indicates that there are probably some fastbin chunks.
+   It is set true on entering a chunk into any fastbin, and cleared early in
+   malloc_consolidate.  The value is approximate since it may be set when there
+   are no fastbin chunks, or it may be clear even if there are fastbin chunks
+   available.  Given its sole purpose is to reduce the number of redundant calls to
+   malloc_consolidate, it does not affect correctness.  As a result we can safely
+   use relaxed atomic accesses.
+ */
+
+
 struct malloc_state
 {
   /* Serialize access.  */
@@ -1680,6 +1670,9 @@  struct malloc_state
   /* Flags (formerly in max_fast).  */
   int flags;
 
+  /* Set if the fastbin chunks contain recently inserted free blocks.  */
+  bool have_fastchunks;
+
   /* Fastbins */
   mfastbinptr fastbinsY[NFASTBINS];
 
@@ -1823,7 +1816,7 @@  malloc_init_state (mstate av)
   set_noncontiguous (av);
   if (av == &main_arena)
     set_max_fast (DEFAULT_MXFAST);
-  av->flags |= FASTCHUNKS_BIT;
+  atomic_store_relaxed (&av->have_fastchunks, false);
 
   av->top = initial_top (av);
 }
@@ -2179,11 +2172,6 @@  do_check_malloc_state (mstate av)
         }
     }
 
-  if (total != 0)
-    assert (have_fastchunks (av));
-  else if (!have_fastchunks (av))
-    assert (total == 0);
-
   /* check normal bins */
   for (i = 1; i < NBINS; ++i)
     {
@@ -3650,7 +3638,7 @@  _int_malloc (mstate av, size_t bytes)
   else
     {
       idx = largebin_index (nb);
-      if (have_fastchunks (av))
+      if (atomic_load_relaxed (&av->have_fastchunks))
         malloc_consolidate (av);
     }
 
@@ -4058,7 +4046,7 @@  _int_malloc (mstate av, size_t bytes)
 
       /* When we are using atomic ops to free fast chunks we can get
          here for all block sizes.  */
-      else if (have_fastchunks (av))
+      else if (atomic_load_relaxed (&av->have_fastchunks))
         {
           malloc_consolidate (av);
           /* restore original bin index */
@@ -4163,7 +4151,7 @@  _int_free (mstate av, mchunkptr p, int have_lock)
 
     free_perturb (chunk2mem(p), size - 2 * SIZE_SZ);
 
-    set_fastchunks(av);
+    atomic_store_relaxed (&av->have_fastchunks, true);
     unsigned int idx = fastbin_index(size);
     fb = &fastbin (av, idx);
 
@@ -4291,7 +4279,7 @@  _int_free (mstate av, mchunkptr p, int have_lock)
     */
 
     if ((unsigned long)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) {
-      if (have_fastchunks(av))
+      if (atomic_load_relaxed (&av->have_fastchunks))
 	malloc_consolidate(av);
 
       if (av == &main_arena) {
@@ -4360,7 +4348,7 @@  static void malloc_consolidate(mstate av)
   */
 
   if (get_max_fast () != 0) {
-    clear_fastchunks(av);
+    atomic_store_relaxed (&av->have_fastchunks, false);
 
     unsorted_bin = unsorted_chunks(av);