Message ID | 87va52nupb.fsf@oldenburg.str.redhat.com |
---|---|
State | New |
Series | malloc: Use current (C11-style) atomics for fastbin access |
Hi Florian,

> This is another cleanup patch in preparation of the extended heap
> protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields
> in struct malloc_chunk, too).
>
> By the way, there is an optimization opportunity for the tcache
> backfill in _int_malloc: After the acquire MO load on the fastbin
> list head, we can traverse the fastbin list as far as we need in
> order to refill the tcache, and update the new list head with a
> single CAS.  This does not have races (ABA races and the like)
> because we have acquired the arena lock at this point.  Some backoff
> is probably needed in case the fastbin list head is contended.  But
> it is probably a good idea to do the first traversal at least once.

I see a 16% regression on ppc64le with a simple threaded malloc test
case.  I guess the C11 atomics aren't as good as what we have in glibc.

Thanks,
Anton

--

#include <malloc.h>
#include <pthread.h>
#include <unistd.h>

#define SIZE 8
#define BATCH 128
#define ITERATIONS (50000000/BATCH)

void *ptrs[BATCH];

static void __attribute__((noinline)) do_one(void)
{
        for (unsigned long i = 0; i < BATCH; i++)
                ptrs[i] = malloc(SIZE);

        asm volatile("# %0": : "m"(ptrs[0]));

        for (unsigned long i = 0; i < BATCH; i++)
                free(ptrs[i]);
}

static void *doit(void *junk)
{
        while (1)
                sleep(3600);

        return NULL;
}

int main(void)
{
        unsigned int i;
        pthread_t tid;

        pthread_create(&tid, NULL, doit, NULL);

        for (i = 0; i < ITERATIONS; i++)
                do_one();

        return 0;
}

> Thanks,
> Florian
>
> 2018-11-12  Florian Weimer  <fweimer@redhat.com>
>
>         * malloc/malloc.c (fastbin_push_entry): New function.
>         (fastbin_pop_entry): Likewise.  Replaces REMOVE_FB.
>         (REMOVE_FB): Remove macro.
>         (_int_malloc): Use fastbin_pop_entry and reindent.
>         (_int_free): Use fastbin_push_entry.
>         (malloc_consolidate): Use atomic_exchange_acquire.
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index bfc605aa3e..7c2186c307 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1316,6 +1316,77 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>  #define set_foot(p, s)       (((mchunkptr) ((char *) (p) + (s)))->mchunk_prev_size = (s))
>
> +/* Add an item to the atomic fastbin list at *ROOT.  Returns the old
> +   value at *ROOT.  Note that properties of the old chunk are only
> +   stable if the caller has acquired the arena lock.  Without the
> +   lock, it can be deallocated at any time.  */
> +static inline struct malloc_chunk *
> +fastbin_push_entry (struct malloc_chunk **root, struct malloc_chunk *e)
> +{
> +  struct malloc_chunk *head;
> +  if (SINGLE_THREAD_P)
> +    {
> +      /* Check that the top of the bin is not the record we are going
> +         to add (i.e., double free).  */
> +      head = *root;
> +      if (head == e)
> +        malloc_printerr ("double free or corruption (fasttop)");
> +      e->fd = head;
> +      *root = e;
> +    }
> +  else
> +    do
> +      {
> +        /* Synchronize with the release MO CAS below.  We do
> +           not need synchronization locally, but fastbin_pop_entry and
> +           (especially) malloc_consolidate read the entire list after
> +           synchronizing on the head, so we need to make sure that the
> +           writes to the next (fd) pointers have happened.  */
> +        head = atomic_load_acquire (root);
> +        /* Check that the top of the bin is not the record we are
> +           going to add (i.e., double free).  */
> +        if (head == e)
> +          malloc_printerr ("double free or corruption (fasttop)");
> +        e->fd = head;
> +      }
> +    /* Synchronizes with the acquire MO CAS in */
> +    while (!atomic_compare_exchange_weak_release (root, &head, e));
> +  return head;
> +}
> +
> +/* Remove an item from the atomic fastbin list at *ROOT.  The caller
> +   must have acquired the arena lock.  */
> +static inline struct malloc_chunk *
> +fastbin_pop_entry (struct malloc_chunk **root)
> +{
> +  struct malloc_chunk *head;
> +  if (SINGLE_THREAD_P)
> +    {
> +      head = *root;
> +      if (head != NULL)
> +        *root = head->fd;
> +    }
> +  else
> +    {
> +      /* Synchronizes with the release MO store in fastbin_push_entry.
> +         Synchronization is needed because we read the next list
> +         pointer.  */
> +      head = atomic_load_acquire (root);
> +      struct malloc_chunk *tail;
> +      do
> +        if (head == NULL)
> +          return NULL;
> +        else
> +          tail = head->fd;
> +      /* Synchronizes with the release MO store in fastbin_push_entry.
> +         We do not have an ABA issue here because the caller has
> +         acquired the arena lock, which ensures that there is only one
> +         thread which removes elements from this list.  */
> +      while (!atomic_compare_exchange_weak_acquire (root, &head, tail));
> +    }
> +  return head;
> +}
> +
>  #pragma GCC poison mchunk_size
>  #pragma GCC poison mchunk_prev_size
>
> @@ -3559,63 +3630,36 @@ _int_malloc (mstate av, size_t bytes)
>       can try it without checking, which saves some time on this fast
>       path.  */
>
> -#define REMOVE_FB(fb, victim, pp)                       \
> -  do                                                    \
> -    {                                                   \
> -      victim = pp;                                      \
> -      if (victim == NULL)                               \
> -        break;                                          \
> -    }                                                   \
> -  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
> -         != victim);                                    \
> -
>    if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
>      {
>        idx = fastbin_index (nb);
>        mfastbinptr *fb = &fastbin (av, idx);
> -      mchunkptr pp;
> -      victim = *fb;
> -
> +      victim = fastbin_pop_entry (fb);
>        if (victim != NULL)
>          {
> -          if (SINGLE_THREAD_P)
> -            *fb = victim->fd;
> -          else
> -            REMOVE_FB (fb, pp, victim);
> -          if (__glibc_likely (victim != NULL))
> -            {
> -              size_t victim_idx = fastbin_index (chunksize (victim));
> -              if (__builtin_expect (victim_idx != idx, 0))
> -                malloc_printerr ("malloc(): memory corruption (fast)");
> -              check_remalloced_chunk (av, victim, nb);
> +          size_t victim_idx = fastbin_index (chunksize (victim));
> +          if (victim_idx != idx)
> +            malloc_printerr ("malloc(): memory corruption (fast)");
> +          check_remalloced_chunk (av, victim, nb);
>  #if USE_TCACHE
> -              /* While we're here, if we see other chunks of the same size,
> -                 stash them in the tcache.  */
> -              size_t tc_idx = csize2tidx (nb);
> -              if (tcache && tc_idx < mp_.tcache_bins)
> +          /* While we're here, if we see other chunks of the same size,
> +             stash them in the tcache.  */
> +          size_t tc_idx = csize2tidx (nb);
> +          if (tcache && tc_idx < mp_.tcache_bins)
> +            {
> +              /* While bin not empty and tcache not full, copy chunks.  */
> +              while (tcache->counts[tc_idx] < mp_.tcache_count)
>                  {
> -                  mchunkptr tc_victim;
> -
> -                  /* While bin not empty and tcache not full, copy chunks.  */
> -                  while (tcache->counts[tc_idx] < mp_.tcache_count
> -                         && (tc_victim = *fb) != NULL)
> -                    {
> -                      if (SINGLE_THREAD_P)
> -                        *fb = tc_victim->fd;
> -                      else
> -                        {
> -                          REMOVE_FB (fb, pp, tc_victim);
> -                          if (__glibc_unlikely (tc_victim == NULL))
> -                            break;
> -                        }
> -                      tcache_put (tc_victim, tc_idx);
> -                    }
> +                  mchunkptr tc_victim = fastbin_pop_entry (fb);
> +                  if (tc_victim == NULL)
> +                    break;
> +                  tcache_put (tc_victim, tc_idx);
>                  }
> -#endif
> -          void *p = chunk2mem (victim);
> -          alloc_perturb (p, bytes);
> -          return p;
>              }
> +#endif
> +          void *p = chunk2mem (victim);
> +          alloc_perturb (p, bytes);
> +          return p;
>          }
>      }
>
> @@ -4227,28 +4271,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>      fb = &fastbin (av, idx);
>
>      /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
> -    mchunkptr old = *fb, old2;
> -
> -    if (SINGLE_THREAD_P)
> -      {
> -        /* Check that the top of the bin is not the record we are going to
> -           add (i.e., double free).  */
> -        if (__builtin_expect (old == p, 0))
> -          malloc_printerr ("double free or corruption (fasttop)");
> -        p->fd = old;
> -        *fb = p;
> -      }
> -    else
> -      do
> -        {
> -          /* Check that the top of the bin is not the record we are going to
> -             add (i.e., double free).  */
> -          if (__builtin_expect (old == p, 0))
> -            malloc_printerr ("double free or corruption (fasttop)");
> -          p->fd = old2 = old;
> -        }
> -      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
> -             != old2);
> +    mchunkptr old = fastbin_push_entry (fb, p);
>
>      /* Check that size of fastbin chunk at the top is the same as
>         size of the chunk that we are adding.  We can dereference OLD
> @@ -4439,7 +4462,9 @@ static void malloc_consolidate(mstate av)
>    maxfb = &fastbin (av, NFASTBINS - 1);
>    fb = &fastbin (av, 0);
>    do {
> -    p = atomic_exchange_acq (fb, NULL);
> +    /* Synchronizes with the release MO store in
> +       fastbin_push_entry.  */
> +    p = atomic_exchange_acquire (fb, NULL);
>      if (p != 0) {
>        do {
>          {
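Florian's single-CAS tcache refill idea from the quoted text above can be
sketched outside glibc.  The program below is only an illustration under
stated assumptions: struct node, push and pop_batch are made-up stand-ins
for struct malloc_chunk and the fastbin helpers, and "single remover"
stands in for holding the arena lock, which is what rules out ABA; the
backoff Florian mentions is not modelled.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for struct malloc_chunk: only the fd link matters.  */
struct node
{
  struct node *fd;
};

/* Push one node with a release CAS, mirroring the shape of the patch's
   fastbin_push_entry.  */
static void
push (struct node *_Atomic *root, struct node *e)
{
  struct node *head;
  do
    {
      head = atomic_load_explicit (root, memory_order_acquire);
      e->fd = head;
    }
  while (!atomic_compare_exchange_weak_explicit (root, &head, e,
                                                 memory_order_release,
                                                 memory_order_relaxed));
}

/* Pop up to MAX nodes with a single CAS.  This is only safe against ABA
   if the caller is the sole remover (the role the arena lock plays in
   _int_malloc).  Returns the number of nodes taken; the removed sublist
   starts at *OUT and is linked through fd.  */
static size_t
pop_batch (struct node *_Atomic *root, struct node **out, size_t max)
{
  struct node *head, *tail;
  size_t n;
  do
    {
      head = atomic_load_explicit (root, memory_order_acquire);
      if (head == NULL)
        return 0;
      /* Walk at most MAX nodes; TAIL ends up at the last node taken.  */
      tail = head;
      for (n = 1; n < max && tail->fd != NULL; ++n)
        tail = tail->fd;
      /* The new list head is the first node we do not take.  */
      tail = tail->fd;
    }
  while (!atomic_compare_exchange_weak_explicit (root, &head, tail,
                                                 memory_order_acquire,
                                                 memory_order_relaxed));
  *out = head;
  return n;
}

int
main (void)
{
  struct node *_Atomic root = NULL;
  struct node nodes[8];
  for (size_t i = 0; i < 8; ++i)
    push (&root, &nodes[i]);

  struct node *batch;
  printf ("refilled %zu entries with one CAS\n",
          pop_batch (&root, &batch, 5));
  return 0;
}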
* Anton Blanchard:

> Hi Florian,
>
>> This is another cleanup patch in preparation of the extended heap
>> protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields
>> in struct malloc_chunk, too).
>>
>> By the way, there is an optimization opportunity for the tcache
>> backfill in _int_malloc: After the acquire MO load on the fastbin
>> list head, we can traverse the fastbin list as far as we need in
>> order to refill the tcache, and update the new list head with a
>> single CAS.  This does not have races (ABA races and the like)
>> because we have acquired the arena lock at this point.  Some backoff
>> is probably needed in case the fastbin list head is contended.  But
>> it is probably a good idea to do the first traversal at least once.
>
> I see a 16% regression on ppc64le with a simple threaded malloc test
> case.  I guess the C11 atomics aren't as good as what we have in glibc.

Uh-oh.  Would you please check whether replacing the two
atomic_load_acquire calls with atomic_load_relaxed restores the previous
performance?

I believe both loads need to be acquire MO under the C11 memory model
(see the comments for why), but the old code did not have them.

Thanks,
Florian
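The experiment Florian asks for amounts to comparing an acquire load
against a relaxed load.  A small throwaway file like the one below (not
glibc code; the function names are made up for the illustration) can be
compiled with something like "gcc -O2 -S" on ppc64le and the two
functions compared: the acquire variant is expected to carry an extra
ordering instruction (typically an lwsync or a load/compare/branch/isync
sequence, depending on how the compiler maps acquire), while the relaxed
variant is a plain load.

#include <stdatomic.h>
#include <stdio.h>

long
load_acquire (const _Atomic long *p)
{
  return atomic_load_explicit (p, memory_order_acquire);
}

long
load_relaxed (const _Atomic long *p)
{
  return atomic_load_explicit (p, memory_order_relaxed);
}

int
main (void)
{
  static _Atomic long x = 42;
  printf ("%ld %ld\n", load_acquire (&x), load_relaxed (&x));
  return 0;
}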
Hi Florian,

> > I see a 16% regression on ppc64le with a simple threaded malloc test
> > case.  I guess the C11 atomics aren't as good as what we have in
> > glibc.
>
> Uh-oh.  Would you please check whether replacing the two
> atomic_load_acquire calls with atomic_load_relaxed restores the
> previous performance?

As you suspect, doing this does restore the performance.  The two lwsync
barrier instructions must be causing the slowdown.

> I believe both loads need to be acquire MO under the C11 memory model
> (see the comments for why), but the old code did not have them.

Ok, thanks for looking into it.

Thanks,
Anton
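To put a rough number on the barrier cost in isolation, one can time a
tight loop of relaxed loads against a loop of acquire loads.  The
following is a crude, hypothetical micro-benchmark, not a substitute for
the malloc/free test earlier in the thread (an uncontended
single-threaded loop may or may not show much by itself), but it isolates
the ordering difference:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static _Atomic long cell;

#define ITERS (100L * 1000 * 1000)

static double
elapsed (struct timespec a, struct timespec b)
{
  return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) * 1e-9;
}

int
main (void)
{
  struct timespec t0, t1, t2;
  long sink = 0;

  /* Write the cell once so its value is not a compile-time constant.  */
  atomic_store (&cell, 1);

  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < ITERS; ++i)
    sink += atomic_load_explicit (&cell, memory_order_relaxed);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  for (long i = 0; i < ITERS; ++i)
    sink += atomic_load_explicit (&cell, memory_order_acquire);
  clock_gettime (CLOCK_MONOTONIC, &t2);

  /* Keep SINK alive so the loops are not optimized away.  */
  asm volatile ("" : : "r" (sink));

  printf ("relaxed: %f s\n", elapsed (t0, t1));
  printf ("acquire: %f s\n", elapsed (t1, t2));
  return 0;
}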
* Anton Blanchard:

> Hi Florian,
>
>> > I see a 16% regression on ppc64le with a simple threaded malloc test
>> > case.  I guess the C11 atomics aren't as good as what we have in
>> > glibc.
>>
>> Uh-oh.  Would you please check whether replacing the two
>> atomic_load_acquire calls with atomic_load_relaxed restores the
>> previous performance?
>
> As you suspect, doing this does restore the performance.  The two lwsync
> barrier instructions must be causing the slowdown.

Okay, I'll post a patch to revert that commit.

However, the old code had this in _int_malloc (where the arena lock is
acquired):

#define REMOVE_FB(fb, victim, pp)                       \
  do                                                    \
    {                                                   \
      victim = pp;                                      \
      if (victim == NULL)                               \
        break;                                          \
    }                                                   \
  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
         != victim);                                    \

…

          REMOVE_FB (fb, pp, victim);

And this in _int_free (without the arena lock):

    do
      {
        /* Check that the top of the bin is not the record we are going to
           add (i.e., double free).  */
        if (__builtin_expect (old == p, 0))
          malloc_printerr ("double free or corruption (fasttop)");
        p->fd = old2 = old;
      }
    while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
           != old2);

I really don't see what makes sure that the store of p->fd happens
before the load of victim->fd.

It works out on POWER for some reason.  I'm attaching a test case that
should exercise these two code paths.

Thanks,
Florian

/* Run with: GLIBC_TUNABLES=glibc.malloc.tcache_count=0 */
#include <err.h>
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
usage (char **argv)
{
  fprintf (stderr, "usage: %s THREADS FREES-PER-THREAD\n", argv[0]);
  exit (1);
}

static void
check (const char *function, int ret)
{
  if (ret != 0)
    {
      errno = ret;
      err (1, "%s", function);
    }
}

static pthread_barrier_t barrier;

static void
xpthread_barrier_wait (pthread_barrier_t *barrier)
{
  int ret = pthread_barrier_wait (barrier);
  if (ret != 0 && ret != PTHREAD_BARRIER_SERIAL_THREAD)
    {
      errno = ret;
      err (1, "pthread_barrier_wait");
    }
}

static int per_thread;

static void *
thread_routine (void *closure)
{
  char **allocations = closure;
  int count = per_thread;
  while (true)
    {
      /* Wait for the main thread to perform allocations.  */
      xpthread_barrier_wait (&barrier);

      for (int i = 0; i < count; ++i)
        {
          if (allocations[i] == NULL)
            errx (1, "allocation is NULL");
          free (allocations[i]);
        }

      /* Signal to the main thread that new allocations can be
         performed.  */
      xpthread_barrier_wait (&barrier);
    }
}

static void
make_allocations (char **allocations, int count)
{
  for (int i = 0; i < count; ++i)
    {
      size_t size = 120; /* 128 byte cache line, 8 bytes malloc overhead.  */
      allocations[i] = malloc (size);
      if (allocations[i] == NULL)
        err (1, "malloc");
      memset (allocations[i], 0xc0, size);
    }
}

int
main (int argc, char **argv)
{
  if (argc != 3)
    usage (argv);
  int thread_count = atoi (argv[1]);
  if (thread_count <= 0)
    usage (argv);
  per_thread = atoi (argv[2]);
  if (per_thread <= 0)
    usage (argv);

  check ("pthread_barrier_init",
         pthread_barrier_init (&barrier, NULL, thread_count + 1));

  pthread_attr_t attr;
  check ("pthread_attr_init", pthread_attr_init (&attr));
  check ("pthread_attr_setstacksize",
         pthread_attr_setstacksize (&attr, 256 * 1024));

  pthread_t *threads = calloc (thread_count, sizeof (*threads));
  if (threads == NULL)
    err (1, "calloc");
  char **allocations = calloc (thread_count * per_thread,
                               sizeof (*allocations));
  if (allocations == NULL)
    err (1, "calloc");
  char **next_allocations = calloc (thread_count * per_thread,
                                    sizeof (*allocations));
  if (next_allocations == NULL)
    err (1, "calloc");

  for (int i = 0; i < thread_count; ++i)
    check ("pthread_create",
           pthread_create (threads + i, &attr, thread_routine,
                           allocations + i * per_thread));

  make_allocations (allocations, thread_count * per_thread);
  while (true)
    {
      xpthread_barrier_wait (&barrier);
      /* Threads perform deallocations here.  */
      make_allocations (next_allocations, thread_count * per_thread);
      xpthread_barrier_wait (&barrier);
      memcpy (allocations, next_allocations,
              thread_count * per_thread * sizeof (*allocations));
    }
}
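Florian's question — what orders the store of p->fd in the free path
against the later load of victim->fd in the malloc path — can also be
modelled outside glibc with plain C11 atomics.  The program below is only
a sketch, not glibc code: node, pusher and popper are made-up stand-ins
for the chunk, the _int_free fastbin push and the _int_malloc fastbin
pop, and "only one remover" stands in for holding the arena lock (which
is what rules out ABA).  Build with something like "gcc -O2 -pthread".
The release CAS in the pusher and the acquire load in the popper are the
pair the C11 model relies on to make the assert reliable on a weakly
ordered machine; with relaxed loads on the consumer side that guarantee
goes away.

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* One "chunk": a link plus a payload standing in for whatever the
   popper dereferences after taking the node off the list.  */
struct node
{
  struct node *fd;
  int payload;
};

static struct node *_Atomic head;
static atomic_bool done;

/* Free path: initialize the node, then publish it with a release CAS,
   as fastbin_push_entry does.  The release makes fd and payload visible
   to a thread that later acquires the list head.  */
static void *
pusher (void *unused)
{
  for (int i = 0; i < 1000000; ++i)
    {
      struct node *n = malloc (sizeof *n);
      n->payload = 42;
      struct node *old;
      do
        {
          old = atomic_load_explicit (&head, memory_order_acquire);
          n->fd = old;
        }
      while (!atomic_compare_exchange_weak_explicit (&head, &old, n,
                                                     memory_order_release,
                                                     memory_order_relaxed));
    }
  atomic_store (&done, true);
  return unused;
}

/* Malloc path: a single remover, standing in for "arena lock held".
   The acquire is what licenses the n->fd and n->payload reads below.  */
static void *
popper (void *unused)
{
  long popped = 0;
  while (!atomic_load (&done) || atomic_load (&head) != NULL)
    {
      struct node *n, *next;
      do
        {
          n = atomic_load_explicit (&head, memory_order_acquire);
          if (n == NULL)
            break;
          next = n->fd;
        }
      while (!atomic_compare_exchange_weak_explicit (&head, &n, next,
                                                     memory_order_acquire,
                                                     memory_order_relaxed));
      if (n == NULL)
        continue;
      assert (n->payload == 42);
      free (n);
      ++popped;
    }
  printf ("popped %ld nodes\n", popped);
  return unused;
}

int
main (void)
{
  pthread_t a, b;
  pthread_create (&a, NULL, pusher, NULL);
  pthread_create (&b, NULL, popper, NULL);
  pthread_join (a, NULL);
  pthread_join (b, NULL);
  return 0;
}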
On 1/16/19 7:30 AM, Florian Weimer wrote: > * Anton Blanchard: > >> Hi Florian, >> >>>> I see a 16% regression on ppc64le with a simple threaded malloc test >>>> case. I guess the C11 atomics aren't as good as what we have in >>>> glibc. >>> >>> Uh-oh. Would you please check if replacing the two >>> atomic_load_acquire with atomic_load_relaxed restore the previous >>> performance? >> >> As you suspect, doing this does restore the performance. The two lwsync >> barrier instructions must be causing the slow down. > > Okay, I'll post a patch to revert that commit. > > However, the old code had this in _int_malloc (where the arena lock is > acquired): > > #define REMOVE_FB(fb, victim, pp) \ > do \ > { \ > victim = pp; \ > if (victim == NULL) \ > break; \ > } \ > while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \ > != victim); \ > … > REMOVE_FB (fb, pp, victim); > > And this in _int_free (without the arena lock): > > do > { > /* Check that the top of the bin is not the record we are going to > add (i.e., double free). */ > if (__builtin_expect (old == p, 0)) > malloc_printerr ("double free or corruption (fasttop)"); > p->fd = old2 = old; > } > while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) > != old2); > > I really don't see what makes sure that the store of p->fd happens > before the load of victim->fd. > > It works out on POWER for some reason. I'm attaching a test case that > should exercise these two code paths. This is still not a root cause analysis of the problem. I think it is OK to revert your patch under the pressure of the upcoming release and the fact that this is a serious performance regression. However, there is some due diligence that needs to be done here, and I think the patch should *right back in* as soon as master opens. We already have *lots* of other code using C11-style atomics in glibc, and we want to make sure those work as best we can. This issue is either one of two things: * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can eventually become corrupted or worse see unbounded growth as we corrupt the chunk list. or * non-C11 atomics are more optimal for C11 atomics and we can adjust the C11 atomics to be as good as the non-C11 versions with a little help from Arm and IBM. We have to be able to prove the fastbin access is correct.
* Carlos O'Donell: > On 1/16/19 7:30 AM, Florian Weimer wrote: >> * Anton Blanchard: >> >>> Hi Florian, >>> >>>>> I see a 16% regression on ppc64le with a simple threaded malloc test >>>>> case. I guess the C11 atomics aren't as good as what we have in >>>>> glibc. >>>> >>>> Uh-oh. Would you please check if replacing the two >>>> atomic_load_acquire with atomic_load_relaxed restore the previous >>>> performance? >>> >>> As you suspect, doing this does restore the performance. The two lwsync >>> barrier instructions must be causing the slow down. >> >> Okay, I'll post a patch to revert that commit. >> >> However, the old code had this in _int_malloc (where the arena lock is >> acquired): >> >> #define REMOVE_FB(fb, victim, pp) \ >> do \ >> { \ >> victim = pp; \ >> if (victim == NULL) \ >> break; \ >> } \ >> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \ >> != victim); \ >> … >> REMOVE_FB (fb, pp, victim); >> >> And this in _int_free (without the arena lock): >> >> do >> { >> /* Check that the top of the bin is not the record we are going to >> add (i.e., double free). */ >> if (__builtin_expect (old == p, 0)) >> malloc_printerr ("double free or corruption (fasttop)"); >> p->fd = old2 = old; >> } >> while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) >> != old2); >> >> I really don't see what makes sure that the store of p->fd happens >> before the load of victim->fd. >> >> It works out on POWER for some reason. I'm attaching a test case that >> should exercise these two code paths. > > This is still not a root cause analysis of the problem. We know the root of the performance problem, the additional synchronization, and, on Aarch64, a consequence of what was supposed to be a minor algorithmic change. > I think it is OK to revert your patch under the pressure of the upcoming > release and the fact that this is a serious performance regression. > > However, there is some due diligence that needs to be done here, and I > think the patch should *right back in* as soon as master opens. We already > have *lots* of other code using C11-style atomics in glibc, and we want > to make sure those work as best we can. This issue is either one of two > things: > > * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can > eventually become corrupted or worse see unbounded growth as we > corrupt the chunk list. > > or > > * non-C11 atomics are more optimal for C11 atomics and we can adjust > the C11 atomics to be as good as the non-C11 versions with a little > help from Arm and IBM. It's also possible there's a different algorithm that's fast to implement with C11 atomics. I'm not a concurrency expert, but I really don't see a way to avoid the additional barriers with the current algorithm. It would also be worth checking if fastbins are even a win on these architectures. Thanks, Florian
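On the question of whether fastbins are a win at all: glibc already
exposes a knob for that experiment, so no patching is needed.  mallopt
with M_MXFAST set to 0 keeps every request out of the fastbins for the
calling process, so the same benchmark can be run with and without them.
A minimal sketch (the loop is only a placeholder for the workload of
interest):

#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* With M_MXFAST set to 0, no request is small enough to be served
     from a fastbin, so the fastbin code paths are effectively disabled
     for this process.  mallopt returns 1 on success.  */
  if (mallopt (M_MXFAST, 0) != 1)
    {
      fprintf (stderr, "mallopt (M_MXFAST, 0) failed\n");
      return 1;
    }

  /* Place the allocation workload to be compared here.  */
  for (int i = 0; i < 1000000; ++i)
    {
      void *p = malloc (8);
      free (p);
    }
  return 0;
}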
On 1/16/19 11:04 AM, Florian Weimer wrote: > * Carlos O'Donell: > >> On 1/16/19 7:30 AM, Florian Weimer wrote: >>> * Anton Blanchard: >>> >>>> Hi Florian, >>>> >>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test >>>>>> case. I guess the C11 atomics aren't as good as what we have in >>>>>> glibc. >>>>> >>>>> Uh-oh. Would you please check if replacing the two >>>>> atomic_load_acquire with atomic_load_relaxed restore the previous >>>>> performance? >>>> >>>> As you suspect, doing this does restore the performance. The two lwsync >>>> barrier instructions must be causing the slow down. >>> >>> Okay, I'll post a patch to revert that commit. >>> >>> However, the old code had this in _int_malloc (where the arena lock is >>> acquired): >>> >>> #define REMOVE_FB(fb, victim, pp) \ >>> do \ >>> { \ >>> victim = pp; \ >>> if (victim == NULL) \ >>> break; \ >>> } \ >>> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \ >>> != victim); \ >>> … >>> REMOVE_FB (fb, pp, victim); >>> >>> And this in _int_free (without the arena lock): >>> >>> do >>> { >>> /* Check that the top of the bin is not the record we are going to >>> add (i.e., double free). */ >>> if (__builtin_expect (old == p, 0)) >>> malloc_printerr ("double free or corruption (fasttop)"); >>> p->fd = old2 = old; >>> } >>> while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) >>> != old2); >>> >>> I really don't see what makes sure that the store of p->fd happens >>> before the load of victim->fd. >>> >>> It works out on POWER for some reason. I'm attaching a test case that >>> should exercise these two code paths. >> >> This is still not a root cause analysis of the problem. > > We know the root of the performance problem, the additional > synchronization, and, on Aarch64, a consequence of what was supposed to > be a minor algorithmic change. OK, I hadn't seen an analysis of the instruction sequences, so I wasn't sure if this was completely understood. This begs the question: Is the existing code correct for Aarch64 and POWER64? I guess we don't know without serious review. >> I think it is OK to revert your patch under the pressure of the upcoming >> release and the fact that this is a serious performance regression. >> >> However, there is some due diligence that needs to be done here, and I >> think the patch should *right back in* as soon as master opens. We already >> have *lots* of other code using C11-style atomics in glibc, and we want >> to make sure those work as best we can. This issue is either one of two >> things: >> >> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can >> eventually become corrupted or worse see unbounded growth as we >> corrupt the chunk list. >> >> or >> >> * non-C11 atomics are more optimal for C11 atomics and we can adjust >> the C11 atomics to be as good as the non-C11 versions with a little >> help from Arm and IBM. > > It's also possible there's a different algorithm that's fast to > implement with C11 atomics. Or the existing C11-style atomics for Aarch64 and POWER64 are pessimistic and we might do better. > I'm not a concurrency expert, but I really don't see a way to avoid the > additional barriers with the current algorithm. It would also be worth > checking if fastbins are even a win on these architectures. Right, it needs a review. At least on x86_64 when we did the tcache vs. fastbin testing the addition of fastbin was still a win. 
We'd have to run the simulator and workload on Aarch64 to see, which we can ask Arm to do... and IBM for POWER64.
On 16/01/2019 16:30, Carlos O'Donell wrote: > On 1/16/19 11:04 AM, Florian Weimer wrote: >> * Carlos O'Donell: >> >>> On 1/16/19 7:30 AM, Florian Weimer wrote: >>>> * Anton Blanchard: >>>> >>>>> Hi Florian, >>>>> >>>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test >>>>>>> case. I guess the C11 atomics aren't as good as what we have in >>>>>>> glibc. >>>>>> >>>>>> Uh-oh. Would you please check if replacing the two >>>>>> atomic_load_acquire with atomic_load_relaxed restore the previous >>>>>> performance? >>>>> >>>>> As you suspect, doing this does restore the performance. The two lwsync >>>>> barrier instructions must be causing the slow down. >>>> >>>> Okay, I'll post a patch to revert that commit. >>>> >>>> However, the old code had this in _int_malloc (where the arena lock is >>>> acquired): >>>> >>>> #define REMOVE_FB(fb, victim, pp) \ >>>> do \ >>>> { \ >>>> victim = pp; \ >>>> if (victim == NULL) \ >>>> break; \ >>>> } \ >>>> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \ >>>> != victim); \ >>>> … >>>> REMOVE_FB (fb, pp, victim); >>>> >>>> And this in _int_free (without the arena lock): >>>> >>>> do >>>> { >>>> /* Check that the top of the bin is not the record we are going to >>>> add (i.e., double free). */ >>>> if (__builtin_expect (old == p, 0)) >>>> malloc_printerr ("double free or corruption (fasttop)"); >>>> p->fd = old2 = old; >>>> } >>>> while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) >>>> != old2); >>>> >>>> I really don't see what makes sure that the store of p->fd happens >>>> before the load of victim->fd. >>>> >>>> It works out on POWER for some reason. I'm attaching a test case that >>>> should exercise these two code paths. >>> >>> This is still not a root cause analysis of the problem. >> >> We know the root of the performance problem, the additional >> synchronization, and, on Aarch64, a consequence of what was supposed to >> be a minor algorithmic change. > > OK, I hadn't seen an analysis of the instruction sequences, so I wasn't > sure if this was completely understood. > > This begs the question: Is the existing code correct for Aarch64 and POWER64? > > I guess we don't know without serious review. > >>> I think it is OK to revert your patch under the pressure of the upcoming >>> release and the fact that this is a serious performance regression. >>> >>> However, there is some due diligence that needs to be done here, and I >>> think the patch should *right back in* as soon as master opens. We already >>> have *lots* of other code using C11-style atomics in glibc, and we want >>> to make sure those work as best we can. This issue is either one of two >>> things: >>> >>> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can >>> eventually become corrupted or worse see unbounded growth as we >>> corrupt the chunk list. >>> >>> or >>> >>> * non-C11 atomics are more optimal for C11 atomics and we can adjust >>> the C11 atomics to be as good as the non-C11 versions with a little >>> help from Arm and IBM. >> >> It's also possible there's a different algorithm that's fast to >> implement with C11 atomics. > > Or the existing C11-style atomics for Aarch64 and POWER64 are pessimistic > and we might do better. > >> I'm not a concurrency expert, but I really don't see a way to avoid the >> additional barriers with the current algorithm. It would also be worth >> checking if fastbins are even a win on these architectures. > > Right, it needs a review. 
> > At least on x86_64 when we did the tcache vs. fastbin testing the addition > of fastbin was still a win. > > We'd have to run the simulator and workload on Aarch64 to see, which we > can ask Arm to do... and IBM for POWER64. > we will look into the regression. it is not yet clear what causes it.
On 1/16/19 11:52 AM, Szabolcs Nagy wrote: > On 16/01/2019 16:30, Carlos O'Donell wrote: >> On 1/16/19 11:04 AM, Florian Weimer wrote: >>> * Carlos O'Donell: >>> >>>> On 1/16/19 7:30 AM, Florian Weimer wrote: >>>>> * Anton Blanchard: >>>>> >>>>>> Hi Florian, >>>>>> >>>>>>>> I see a 16% regression on ppc64le with a simple threaded malloc test >>>>>>>> case. I guess the C11 atomics aren't as good as what we have in >>>>>>>> glibc. >>>>>>> >>>>>>> Uh-oh. Would you please check if replacing the two >>>>>>> atomic_load_acquire with atomic_load_relaxed restore the previous >>>>>>> performance? >>>>>> >>>>>> As you suspect, doing this does restore the performance. The two lwsync >>>>>> barrier instructions must be causing the slow down. >>>>> >>>>> Okay, I'll post a patch to revert that commit. >>>>> >>>>> However, the old code had this in _int_malloc (where the arena lock is >>>>> acquired): >>>>> >>>>> #define REMOVE_FB(fb, victim, pp) \ >>>>> do \ >>>>> { \ >>>>> victim = pp; \ >>>>> if (victim == NULL) \ >>>>> break; \ >>>>> } \ >>>>> while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \ >>>>> != victim); \ >>>>> … >>>>> REMOVE_FB (fb, pp, victim); >>>>> >>>>> And this in _int_free (without the arena lock): >>>>> >>>>> do >>>>> { >>>>> /* Check that the top of the bin is not the record we are going to >>>>> add (i.e., double free). */ >>>>> if (__builtin_expect (old == p, 0)) >>>>> malloc_printerr ("double free or corruption (fasttop)"); >>>>> p->fd = old2 = old; >>>>> } >>>>> while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) >>>>> != old2); >>>>> >>>>> I really don't see what makes sure that the store of p->fd happens >>>>> before the load of victim->fd. >>>>> >>>>> It works out on POWER for some reason. I'm attaching a test case that >>>>> should exercise these two code paths. >>>> >>>> This is still not a root cause analysis of the problem. >>> >>> We know the root of the performance problem, the additional >>> synchronization, and, on Aarch64, a consequence of what was supposed to >>> be a minor algorithmic change. >> >> OK, I hadn't seen an analysis of the instruction sequences, so I wasn't >> sure if this was completely understood. >> >> This begs the question: Is the existing code correct for Aarch64 and POWER64? >> >> I guess we don't know without serious review. >> >>>> I think it is OK to revert your patch under the pressure of the upcoming >>>> release and the fact that this is a serious performance regression. >>>> >>>> However, there is some due diligence that needs to be done here, and I >>>> think the patch should *right back in* as soon as master opens. We already >>>> have *lots* of other code using C11-style atomics in glibc, and we want >>>> to make sure those work as best we can. This issue is either one of two >>>> things: >>>> >>>> * non-C11 atomics are *wrong* for aarch64/power64 and the fastbins can >>>> eventually become corrupted or worse see unbounded growth as we >>>> corrupt the chunk list. >>>> >>>> or >>>> >>>> * non-C11 atomics are more optimal for C11 atomics and we can adjust >>>> the C11 atomics to be as good as the non-C11 versions with a little >>>> help from Arm and IBM. >>> >>> It's also possible there's a different algorithm that's fast to >>> implement with C11 atomics. >> >> Or the existing C11-style atomics for Aarch64 and POWER64 are pessimistic >> and we might do better. >> >>> I'm not a concurrency expert, but I really don't see a way to avoid the >>> additional barriers with the current algorithm. 
It would also be worth >>> checking if fastbins are even a win on these architectures. >> >> Right, it needs a review. >> >> At least on x86_64 when we did the tcache vs. fastbin testing the addition >> of fastbin was still a win. >> >> We'd have to run the simulator and workload on Aarch64 to see, which we >> can ask Arm to do... and IBM for POWER64. >> > > we will look into the regression. > > it is not yet clear what causes it. > Awesome. Thank you for that. To be clear my worry is that we're using C11-style atomics all over the place in glibc now, and if they are not correct or really slow we should understand why. This particular change is OK for for me, I fully understand you want the performance back to level it was at before.
diff --git a/malloc/malloc.c b/malloc/malloc.c
index bfc605aa3e..7c2186c307 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1316,6 +1316,77 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 #define set_foot(p, s)       (((mchunkptr) ((char *) (p) + (s)))->mchunk_prev_size = (s))
 
+/* Add an item to the atomic fastbin list at *ROOT.  Returns the old
+   value at *ROOT.  Note that properties of the old chunk are only
+   stable if the caller has acquired the arena lock.  Without the
+   lock, it can be deallocated at any time.  */
+static inline struct malloc_chunk *
+fastbin_push_entry (struct malloc_chunk **root, struct malloc_chunk *e)
+{
+  struct malloc_chunk *head;
+  if (SINGLE_THREAD_P)
+    {
+      /* Check that the top of the bin is not the record we are going
+         to add (i.e., double free).  */
+      head = *root;
+      if (head == e)
+        malloc_printerr ("double free or corruption (fasttop)");
+      e->fd = head;
+      *root = e;
+    }
+  else
+    do
+      {
+        /* Synchronize with the release MO CAS below.  We do
+           not need synchronization locally, but fastbin_pop_entry and
+           (especially) malloc_consolidate read the entire list after
+           synchronizing on the head, so we need to make sure that the
+           writes to the next (fd) pointers have happened.  */
+        head = atomic_load_acquire (root);
+        /* Check that the top of the bin is not the record we are
+           going to add (i.e., double free).  */
+        if (head == e)
+          malloc_printerr ("double free or corruption (fasttop)");
+        e->fd = head;
+      }
+    /* Synchronizes with the acquire MO CAS in */
+    while (!atomic_compare_exchange_weak_release (root, &head, e));
+  return head;
+}
+
+/* Remove an item from the atomic fastbin list at *ROOT.  The caller
+   must have acquired the arena lock.  */
+static inline struct malloc_chunk *
+fastbin_pop_entry (struct malloc_chunk **root)
+{
+  struct malloc_chunk *head;
+  if (SINGLE_THREAD_P)
+    {
+      head = *root;
+      if (head != NULL)
+        *root = head->fd;
+    }
+  else
+    {
+      /* Synchronizes with the release MO store in fastbin_push_entry.
+         Synchronization is needed because we read the next list
+         pointer.  */
+      head = atomic_load_acquire (root);
+      struct malloc_chunk *tail;
+      do
+        if (head == NULL)
+          return NULL;
+        else
+          tail = head->fd;
+      /* Synchronizes with the release MO store in fastbin_push_entry.
+         We do not have an ABA issue here because the caller has
+         acquired the arena lock, which ensures that there is only one
+         thread which removes elements from this list.  */
+      while (!atomic_compare_exchange_weak_acquire (root, &head, tail));
+    }
+  return head;
+}
+
 #pragma GCC poison mchunk_size
 #pragma GCC poison mchunk_prev_size
 
@@ -3559,63 +3630,36 @@ _int_malloc (mstate av, size_t bytes)
      can try it without checking, which saves some time on this fast
      path.  */
 
-#define REMOVE_FB(fb, victim, pp)                       \
-  do                                                    \
-    {                                                   \
-      victim = pp;                                      \
-      if (victim == NULL)                               \
-        break;                                          \
-    }                                                   \
-  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
-         != victim);                                    \
-
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
     {
       idx = fastbin_index (nb);
       mfastbinptr *fb = &fastbin (av, idx);
-      mchunkptr pp;
-      victim = *fb;
-
+      victim = fastbin_pop_entry (fb);
       if (victim != NULL)
         {
-          if (SINGLE_THREAD_P)
-            *fb = victim->fd;
-          else
-            REMOVE_FB (fb, pp, victim);
-          if (__glibc_likely (victim != NULL))
-            {
-              size_t victim_idx = fastbin_index (chunksize (victim));
-              if (__builtin_expect (victim_idx != idx, 0))
-                malloc_printerr ("malloc(): memory corruption (fast)");
-              check_remalloced_chunk (av, victim, nb);
+          size_t victim_idx = fastbin_index (chunksize (victim));
+          if (victim_idx != idx)
+            malloc_printerr ("malloc(): memory corruption (fast)");
+          check_remalloced_chunk (av, victim, nb);
 #if USE_TCACHE
-              /* While we're here, if we see other chunks of the same size,
-                 stash them in the tcache.  */
-              size_t tc_idx = csize2tidx (nb);
-              if (tcache && tc_idx < mp_.tcache_bins)
+          /* While we're here, if we see other chunks of the same size,
+             stash them in the tcache.  */
+          size_t tc_idx = csize2tidx (nb);
+          if (tcache && tc_idx < mp_.tcache_bins)
+            {
+              /* While bin not empty and tcache not full, copy chunks.  */
+              while (tcache->counts[tc_idx] < mp_.tcache_count)
                 {
-                  mchunkptr tc_victim;
-
-                  /* While bin not empty and tcache not full, copy chunks.  */
-                  while (tcache->counts[tc_idx] < mp_.tcache_count
-                         && (tc_victim = *fb) != NULL)
-                    {
-                      if (SINGLE_THREAD_P)
-                        *fb = tc_victim->fd;
-                      else
-                        {
-                          REMOVE_FB (fb, pp, tc_victim);
-                          if (__glibc_unlikely (tc_victim == NULL))
-                            break;
-                        }
-                      tcache_put (tc_victim, tc_idx);
-                    }
+                  mchunkptr tc_victim = fastbin_pop_entry (fb);
+                  if (tc_victim == NULL)
+                    break;
+                  tcache_put (tc_victim, tc_idx);
                 }
-#endif
-          void *p = chunk2mem (victim);
-          alloc_perturb (p, bytes);
-          return p;
             }
+#endif
+          void *p = chunk2mem (victim);
+          alloc_perturb (p, bytes);
+          return p;
         }
     }
 
@@ -4227,28 +4271,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
     fb = &fastbin (av, idx);
 
     /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
-    mchunkptr old = *fb, old2;
-
-    if (SINGLE_THREAD_P)
-      {
-        /* Check that the top of the bin is not the record we are going to
-           add (i.e., double free).  */
-        if (__builtin_expect (old == p, 0))
-          malloc_printerr ("double free or corruption (fasttop)");
-        p->fd = old;
-        *fb = p;
-      }
-    else
-      do
-        {
-          /* Check that the top of the bin is not the record we are going to
-             add (i.e., double free).  */
-          if (__builtin_expect (old == p, 0))
-            malloc_printerr ("double free or corruption (fasttop)");
-          p->fd = old2 = old;
-        }
-      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
-             != old2);
+    mchunkptr old = fastbin_push_entry (fb, p);
 
     /* Check that size of fastbin chunk at the top is the same as
        size of the chunk that we are adding.  We can dereference OLD
@@ -4439,7 +4462,9 @@ static void malloc_consolidate(mstate av)
   maxfb = &fastbin (av, NFASTBINS - 1);
   fb = &fastbin (av, 0);
   do {
-    p = atomic_exchange_acq (fb, NULL);
+    /* Synchronizes with the release MO store in
+       fastbin_push_entry.  */
+    p = atomic_exchange_acquire (fb, NULL);
     if (p != 0) {
      do {
        {
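The malloc_consolidate hunk at the end is the one place where the patch
drains a whole bin at once instead of popping single entries.  Reduced to
a standalone sketch (not glibc code; node and drain are made-up
stand-ins), the pattern is an acquire exchange of the list head with
NULL; the acquire pairs with the release CAS of every earlier push so
that the fd links of the drained entries can be followed safely:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct node
{
  struct node *fd;
};

/* Empty the bin and return its former contents.  The acquire exchange
   synchronizes with the release CAS performed by every pusher, so the
   caller may follow the fd links of everything it received.  */
static struct node *
drain (struct node *_Atomic *root)
{
  return atomic_exchange_explicit (root, NULL, memory_order_acquire);
}

int
main (void)
{
  struct node n2 = { NULL }, n1 = { &n2 };
  struct node *_Atomic bin = &n1;

  size_t count = 0;
  for (struct node *p = drain (&bin); p != NULL; p = p->fd)
    ++count;
  printf ("drained %zu nodes\n", count);
  return 0;
}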