Re: [PATCH] malloc: Use current (C11-style) atomics for fastbin access
- From: Anton Blanchard <anton at ozlabs dot org>
- To: Florian Weimer <fweimer at redhat dot com>
- Cc: libc-alpha at sourceware dot org, Tulio Magno Quites Machado Filho <tuliom at linux dot vnet dot ibm dot com>, Paul Clarke <pc at us dot ibm dot com>, Bill Schmidt <wschmidt at us dot ibm dot com>
- Date: Wed, 16 Jan 2019 09:26:55 +1100
- Subject: Re: [PATCH] malloc: Use current (C11-style) atomics for fastbin access
- References: <87va52nupb.fsf@oldenburg.str.redhat.com>
Hi Florian,
> This is another cleanup patch in preparation for the extended heap
> protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields
> in struct malloc_chunk, too).
>
> By the way, there is an optimization opportunity for the tcache
> backfill in _int_malloc: After the acquire MO load on the fastbin
> list head, we can traverse the fastbin list as far as we need in
> order to refill the tcache, and update the new list head with a
> single CAS. This does not have races (ABA races and the like)
> because we have acquired the arena lock at this point. Some backoff
> is probably needed in case the fastbin list head is contended. But
> it is probably a good idea to do the first traversal at least once.
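For illustration, a rough sketch of that refill idea (a hypothetical
helper, not part of this patch; it borrows the glibc-internal names
tcache, mp_, tcache_put and the atomic wrappers from malloc.c, and
leaves out the backoff) might look like:

static void
tcache_refill_from_fastbin (struct malloc_chunk **root, size_t tc_idx)
{
  /* Synchronizes with the release MO CAS in fastbin_push_entry.  */
  struct malloc_chunk *head = atomic_load_acquire (root);
  while (head != NULL)
    {
      /* Walk as far as the tcache has room, remembering the cut
         point.  The walked prefix is stable: the arena lock keeps
         other threads from popping, and pushes only change the head.  */
      struct malloc_chunk *p = head;
      size_t count = 0;
      while (p != NULL && tcache->counts[tc_idx] + count < mp_.tcache_count)
        {
          p = p->fd;
          ++count;
        }
      /* Detach the walked prefix with a single CAS.  On failure a
         concurrent free pushed a new head; HEAD is reloaded and we
         walk again (this is where the backoff would go).  */
      if (atomic_compare_exchange_weak_acquire (root, &head, p))
        {
          for (struct malloc_chunk *q = head; count > 0; --count)
            {
              struct malloc_chunk *next = q->fd;
              tcache_put (q, tc_idx);
              q = next;
            }
          return;
        }
    }
}

Compared with popping one chunk at a time, this performs one CAS per
refill rather than one per chunk.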
I see a 16% regression on ppc64le with the simple threaded malloc test
case below. I guess the C11-style atomics don't generate code that is
as good as the existing glibc atomics on this target.
Thanks,
Anton
--
/* Microbenchmark: repeatedly allocate and free a batch of small
   chunks, exercising the malloc fastbin paths.  */
#include <malloc.h>
#include <pthread.h>
#include <unistd.h>

#define SIZE 8
#define BATCH 128
#define ITERATIONS (50000000/BATCH)

void *ptrs[BATCH];

static void __attribute__((noinline)) do_one(void)
{
        for (unsigned long i = 0; i < BATCH; i++)
                ptrs[i] = malloc(SIZE);

        /* Keep the compiler from optimising the allocations away.  */
        asm volatile("# %0": : "m"(ptrs[0]));

        for (unsigned long i = 0; i < BATCH; i++)
                free(ptrs[i]);
}

/* Idle thread: its only job is to make the process multi-threaded, so
   that malloc takes the atomic paths rather than the SINGLE_THREAD_P
   shortcuts.  */
static void *doit(void *junk)
{
        while (1)
                sleep(3600);
        return NULL;
}

int main(void)
{
        unsigned int i;
        pthread_t tid;

        pthread_create(&tid, NULL, doit, NULL);

        for (i = 0; i < ITERATIONS; i++)
                do_one();

        return 0;
}
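(For reference, the test builds with something like
"gcc -O2 -pthread test.c -o test"; the file name and flags are
incidental. Time it against glibc built with and without the patch.)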
> Thanks,
> Florian
>
> 2018-11-12 Florian Weimer <fweimer@redhat.com>
>
> * malloc/malloc.c (fastbin_push_entry): New function.
> (fastbin_pop_entry): Likewise. Replaces REMOVE_FB.
> (REMOVE_FB): Remove macro.
> (_int_malloc): Use fastbin_pop_entry and reindent.
> (_int_free): Use fastbin_push_entry.
> (malloc_consolidate): Use atomic_exchange_acquire.
>
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index bfc605aa3e..7c2186c307 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1316,6 +1316,77 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>  #define set_foot(p, s)  (((mchunkptr) ((char *) (p) + (s)))->mchunk_prev_size = (s))
> 
> +/* Add an item to the atomic fastbin list at *ROOT.  Returns the old
> +   value at *ROOT.  Note that properties of the old chunk are only
> +   stable if the caller has acquired the arena lock.  Without the
> +   lock, it can be deallocated at any time.  */
> +static inline struct malloc_chunk *
> +fastbin_push_entry (struct malloc_chunk **root, struct malloc_chunk *e)
> +{
> +  struct malloc_chunk *head;
> +  if (SINGLE_THREAD_P)
> +    {
> +      /* Check that the top of the bin is not the record we are going
> +         to add (i.e., double free).  */
> +      head = *root;
> +      if (head == e)
> +        malloc_printerr ("double free or corruption (fasttop)");
> +      e->fd = head;
> +      *root = e;
> +    }
> +  else
> +    do
> +      {
> +        /* Synchronize with the release MO CAS below.  We do not need
> +           synchronization locally, but fastbin_pop_entry and
> +           (especially) malloc_consolidate read the entire list after
> +           synchronizing on the head, so we need to make sure that the
> +           writes to the next (fd) pointers have happened.  */
> +        head = atomic_load_acquire (root);
> +        /* Check that the top of the bin is not the record we are
> +           going to add (i.e., double free).  */
> +        if (head == e)
> +          malloc_printerr ("double free or corruption (fasttop)");
> +        e->fd = head;
> +      }
> +    /* Synchronizes with the acquire MO CAS in fastbin_pop_entry.  */
> +    while (!atomic_compare_exchange_weak_release (root, &head, e));
> +  return head;
> +}
> +
> +/* Remove an item from the atomic fastbin list at *ROOT.  The caller
> +   must have acquired the arena lock.  */
> +static inline struct malloc_chunk *
> +fastbin_pop_entry (struct malloc_chunk **root)
> +{
> +  struct malloc_chunk *head;
> +  if (SINGLE_THREAD_P)
> +    {
> +      head = *root;
> +      if (head != NULL)
> +        *root = head->fd;
> +    }
> +  else
> +    {
> +      /* Synchronizes with the release MO store in fastbin_push_entry.
> +         Synchronization is needed because we read the next list
> +         pointer.  */
> +      head = atomic_load_acquire (root);
> +      struct malloc_chunk *tail;
> +      do
> +        if (head == NULL)
> +          return NULL;
> +        else
> +          tail = head->fd;
> +      /* Synchronizes with the release MO store in fastbin_push_entry.
> +         We do not have an ABA issue here because the caller has
> +         acquired the arena lock, which ensures that there is only one
> +         thread which removes elements from this list.  */
> +      while (!atomic_compare_exchange_weak_acquire (root, &head, tail));
> +    }
> +  return head;
> +}
> +
> #pragma GCC poison mchunk_size
> #pragma GCC poison mchunk_prev_size
>
> @@ -3559,63 +3630,36 @@ _int_malloc (mstate av, size_t bytes)
>      can try it without checking, which saves some time on this fast
>      path.  */
> 
> -#define REMOVE_FB(fb, victim, pp)                       \
> -  do                                                    \
> -    {                                                   \
> -      victim = pp;                                      \
> -      if (victim == NULL)                               \
> -        break;                                          \
> -    }                                                   \
> -  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
> -         != victim);                                    \
> -
>    if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
>      {
>        idx = fastbin_index (nb);
>        mfastbinptr *fb = &fastbin (av, idx);
> -      mchunkptr pp;
> -      victim = *fb;
> -
> +      victim = fastbin_pop_entry (fb);
>        if (victim != NULL)
>          {
> -          if (SINGLE_THREAD_P)
> -            *fb = victim->fd;
> -          else
> -            REMOVE_FB (fb, pp, victim);
> -          if (__glibc_likely (victim != NULL))
> -            {
> -              size_t victim_idx = fastbin_index (chunksize (victim));
> -              if (__builtin_expect (victim_idx != idx, 0))
> -                malloc_printerr ("malloc(): memory corruption (fast)");
> -              check_remalloced_chunk (av, victim, nb);
> +          size_t victim_idx = fastbin_index (chunksize (victim));
> +          if (victim_idx != idx)
> +            malloc_printerr ("malloc(): memory corruption (fast)");
> +          check_remalloced_chunk (av, victim, nb);
>  #if USE_TCACHE
> -              /* While we're here, if we see other chunks of the same size,
> -                 stash them in the tcache.  */
> -              size_t tc_idx = csize2tidx (nb);
> -              if (tcache && tc_idx < mp_.tcache_bins)
> +          /* While we're here, if we see other chunks of the same size,
> +             stash them in the tcache.  */
> +          size_t tc_idx = csize2tidx (nb);
> +          if (tcache && tc_idx < mp_.tcache_bins)
> +            {
> +              /* While bin not empty and tcache not full, copy chunks.  */
> +              while (tcache->counts[tc_idx] < mp_.tcache_count)
>                  {
> -                  mchunkptr tc_victim;
> -
> -                  /* While bin not empty and tcache not full, copy chunks.  */
> -                  while (tcache->counts[tc_idx] < mp_.tcache_count
> -                         && (tc_victim = *fb) != NULL)
> -                    {
> -                      if (SINGLE_THREAD_P)
> -                        *fb = tc_victim->fd;
> -                      else
> -                        {
> -                          REMOVE_FB (fb, pp, tc_victim);
> -                          if (__glibc_unlikely (tc_victim == NULL))
> -                            break;
> -                        }
> -                      tcache_put (tc_victim, tc_idx);
> -                    }
> +                  mchunkptr tc_victim = fastbin_pop_entry (fb);
> +                  if (tc_victim == NULL)
> +                    break;
> +                  tcache_put (tc_victim, tc_idx);
>                  }
> -#endif
> -              void *p = chunk2mem (victim);
> -              alloc_perturb (p, bytes);
> -              return p;
>              }
> +#endif
> +          void *p = chunk2mem (victim);
> +          alloc_perturb (p, bytes);
> +          return p;
>          }
>      }
>
> @@ -4227,28 +4271,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>      fb = &fastbin (av, idx);
> 
>      /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
> -    mchunkptr old = *fb, old2;
> -
> -    if (SINGLE_THREAD_P)
> -      {
> -        /* Check that the top of the bin is not the record we are going to
> -           add (i.e., double free).  */
> -        if (__builtin_expect (old == p, 0))
> -          malloc_printerr ("double free or corruption (fasttop)");
> -        p->fd = old;
> -        *fb = p;
> -      }
> -    else
> -      do
> -        {
> -          /* Check that the top of the bin is not the record we are going to
> -             add (i.e., double free).  */
> -          if (__builtin_expect (old == p, 0))
> -            malloc_printerr ("double free or corruption (fasttop)");
> -          p->fd = old2 = old;
> -        }
> -      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
> -             != old2);
> +    mchunkptr old = fastbin_push_entry (fb, p);
> 
>      /* Check that size of fastbin chunk at the top is the same as
>         size of the chunk that we are adding.  We can dereference OLD
> @@ -4439,7 +4462,9 @@ static void malloc_consolidate(mstate av)
>    maxfb = &fastbin (av, NFASTBINS - 1);
>    fb = &fastbin (av, 0);
>    do {
> -    p = atomic_exchange_acq (fb, NULL);
> +    /* Synchronizes with the release MO store in
> +       fastbin_push_entry.  */
> +    p = atomic_exchange_acquire (fb, NULL);
>      if (p != 0) {
>        do {
>          {