This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH] malloc: Use current (C11-style) atomics for fastbin access


Hi Florian,

> This is another cleanup patch in preparation for the extended heap
> protector (which will cover the fd/bk/fd_nextsize/bk_nextsize fields
> in struct malloc_chunk, too).
> 
> By the way, there is an optimization opportunity for the tcache
> backfill in _int_malloc: After the acquire MO load on the fastbin
> list head, we can traverse the fastbin list as far as we need in
> order to refill the tcache, and update the new list head with a
> single CAS.  This does not have races (ABA races and the like)
> because we have acquired the arena lock at this point.  Some backoff
> is probably needed in case the fastbin list head is contended.  But
> it is probably a good idea to do the first traversal at least once.
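
For reference, here is a minimal standalone sketch of that single-CAS
refill idea, written against plain C11 stdatomic rather than glibc's
internal atomics; take_prefix, struct node, and the taken_tail
out-parameter are made-up names for illustration and are not part of the
patch:

#include <stdatomic.h>
#include <stddef.h>

struct node
{
  struct node *fd;
};

/* Take up to WANTED (>= 1) nodes from the front of *ROOT with a single
   CAS.  Returns the first node taken; the rest are linked through ->fd
   and the last one is stored in *TAKEN_TAIL.  Safe only if the caller
   holds a lock (here: the arena lock) that excludes all other removers,
   so the nodes walked below cannot disappear and there is no ABA.  */
struct node *
take_prefix (_Atomic (struct node *) *root, size_t wanted,
             struct node **taken_tail)
{
  /* Synchronize with the release MO CAS performed by pushers.  */
  struct node *head = atomic_load_explicit (root, memory_order_acquire);

  for (;;)
    {
      if (head == NULL)
        return NULL;

      /* Walk forward over the prefix we intend to take.  */
      struct node *tail = head;
      size_t n = 1;
      while (n < wanted && tail->fd != NULL)
        {
          tail = tail->fd;
          ++n;
        }

      struct node *rest = tail->fd;
      if (atomic_compare_exchange_weak_explicit (root, &head, rest,
                                                 memory_order_acquire,
                                                 memory_order_acquire))
        {
          tail->fd = NULL;      /* Detach the prefix we took.  */
          *taken_tail = tail;
          return head;
        }
      /* CAS failed: HEAD now holds the new list head; retry, possibly
         with back-off as suggested above.  */
    }
}

The CAS only advances the list head past the walked prefix; the walked
nodes stay valid because the arena lock keeps any other thread from
removing them, which is what rules out ABA here.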

I see a 16% regression on ppc64le with a simple threaded malloc test
case. I guess the C11 atomics aren't as good as what we have in glibc.

Thanks,
Anton
--

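/* Threaded malloc/free microbenchmark: the main thread allocates and
   frees BATCH small blocks per iteration while an idle second thread
   keeps the process multi-threaded.  Built as an ordinary pthread
   program, e.g. "gcc -O2 -pthread" (illustrative; the original build
   flags are not given).  */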
#include <malloc.h>
#include <pthread.h>
#include <unistd.h>

#define SIZE 8
#define BATCH 128
#define ITERATIONS (50000000/BATCH)

void *ptrs[BATCH];

static void __attribute__((noinline)) do_one(void)
{
	for (unsigned long i = 0; i < BATCH; i++)
		ptrs[i] = malloc(SIZE);

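	/* Empty asm with ptrs[0] as a memory input: acts as a compiler
	   barrier between the malloc and free loops.  */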
	asm volatile("# %0": : "m"(ptrs[0]));

	for (unsigned long i = 0; i < BATCH; i++)
		free(ptrs[i]);
}

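/* The second thread only sleeps; it exists to make the process
   multi-threaded so that glibc's SINGLE_THREAD_P fast paths are not
   taken and the atomic fastbin code is exercised.  */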
static void *doit(void *junk)
{
	while (1)
		sleep(3600);

	return NULL;
}

int main(void)
{
	unsigned int i;
	pthread_t tid;

	pthread_create(&tid, NULL, doit, NULL);

	for (i = 0; i < ITERATIONS; i++)
		do_one();

	return 0;
}

> Thanks,
> Florian
> 
> 2018-11-12  Florian Weimer  <fweimer@redhat.com>
> 
> 	* malloc/malloc.c (fastbin_push_entry): New function.
> 	(fastbin_pop_entry): Likewise.  Replaces REMOVE_FB.
> 	(REMOVE_FB): Remove macro.
> 	(_int_malloc): Use fastbin_pop_entry and reindent.
> 	(_int_free): Use fastbin_push_entry.
> 	(malloc_consolidate): Use atomic_exchange_acquire.
> 
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index bfc605aa3e..7c2186c307 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1316,6 +1316,77 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
>  #define set_foot(p, s)       (((mchunkptr) ((char *) (p) + (s)))->mchunk_prev_size = (s))
> +/* Add an item to the atomic fastbin list at *ROOT.  Returns the old
> +   value at *ROOT.  Note that properties of the old chunk are only
> +   stable if the caller has acquired the arena lock.  Without the
> +   lock, it can be deallocated at any time.  */
> +static inline struct malloc_chunk *
> +fastbin_push_entry (struct malloc_chunk **root, struct malloc_chunk *e)
> +{
> +  struct malloc_chunk *head;
> +  if (SINGLE_THREAD_P)
> +    {
> +      /* Check that the top of the bin is not the record we are going
> +	 to add (i.e., double free).  */
> +      head = *root;
> +      if (head == e)
> +	malloc_printerr ("double free or corruption (fasttop)");
> +      e->fd = head;
> +      *root = e;
> +    }
> +  else
> +    do
> +      {
> +	/* Synchronize with the release MO CAS below.  We do
> +	   not need synchronization locally, but fastbin_pop_entry and
> +	   (especially) malloc_consolidate read the entire list after
> +	   synchronizing on the head, so we need to make sure that the
> +	   writes to the next (fd) pointers have happened.  */
> +	head = atomic_load_acquire (root);
> +	/* Check that the top of the bin is not the record we are
> +	   going to add (i.e., double free).  */
> +	if (head == e)
> +	  malloc_printerr ("double free or corruption (fasttop)");
> +	e->fd = head;
> +      }
> +  /* Synchronizes with the acquire MO CAS in  */
> +    while (!atomic_compare_exchange_weak_release (root, &head, e));
> +  return head;
> +}
> +
> +/* Remove an item from the atomic fastbin list at *ROOT.  The caller
> +   must have acquired the arena lock.  */
> +static inline struct malloc_chunk *
> +fastbin_pop_entry (struct malloc_chunk **root)
> +{
> +  struct malloc_chunk *head;
> +  if (SINGLE_THREAD_P)
> +    {
> +      head = *root;
> +      if (head != NULL)
> +	*root = head->fd;
> +    }
> +  else
> +    {
> +      /* Synchronizes with the release MO store in fastbin_push_entry.
> +	 Synchronization is needed because we read the next list
> +	 pointer.  */
> +      head = atomic_load_acquire (root);
> +      struct malloc_chunk *tail;
> +      do
> +	if (head == NULL)
> +	  return NULL;
> +	else
> +	  tail = head->fd;
> +      /* Synchronizes with the release MO store in fastbin_push_entry.
> +	 We do not have an ABA issue here because the caller has
> +	 acquired the arena lock, which ensures that there is only one
> +	 thread which removes elements from this list.  */
> +      while (!atomic_compare_exchange_weak_acquire (root, &head, tail));
> +    }
> +  return head;
> +}
> +
>  #pragma GCC poison mchunk_size
>  #pragma GCC poison mchunk_prev_size
>  
> @@ -3559,63 +3630,36 @@ _int_malloc (mstate av, size_t bytes)
>       can try it without checking, which saves some time on this fast path. */
>  
> -#define REMOVE_FB(fb, victim, pp)			\
> -  do							\
> -    {							\
> -      victim = pp;					\
> -      if (victim == NULL)				\
> -	break;						\
> -    }							\
> -  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
> -	 != victim);					\
> -
>    if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
>      {
>        idx = fastbin_index (nb);
>        mfastbinptr *fb = &fastbin (av, idx);
> -      mchunkptr pp;
> -      victim = *fb;
> -
> +      victim = fastbin_pop_entry (fb);
>        if (victim != NULL)
>  	{
> -	  if (SINGLE_THREAD_P)
> -	    *fb = victim->fd;
> -	  else
> -	    REMOVE_FB (fb, pp, victim);
> -	  if (__glibc_likely (victim != NULL))
> -	    {
> -	      size_t victim_idx = fastbin_index (chunksize (victim));
> -	      if (__builtin_expect (victim_idx != idx, 0))
> -		malloc_printerr ("malloc(): memory corruption (fast)");
> -	      check_remalloced_chunk (av, victim, nb);
> +	  size_t victim_idx = fastbin_index (chunksize (victim));
> +	  if (victim_idx != idx)
> +	    malloc_printerr ("malloc(): memory corruption (fast)");
> +	  check_remalloced_chunk (av, victim, nb);
>  #if USE_TCACHE
> -	      /* While we're here, if we see other chunks of the same size,
> -		 stash them in the tcache.  */
> -	      size_t tc_idx = csize2tidx (nb);
> -	      if (tcache && tc_idx < mp_.tcache_bins)
> +	  /* While we're here, if we see other chunks of the same size,
> +	     stash them in the tcache.  */
> +	  size_t tc_idx = csize2tidx (nb);
> +	  if (tcache && tc_idx < mp_.tcache_bins)
> +	    {
> +	      /* While bin not empty and tcache not full, copy chunks.  */
> +	      while (tcache->counts[tc_idx] < mp_.tcache_count)
>  		{
> -		  mchunkptr tc_victim;
> -
> -		  /* While bin not empty and tcache not full, copy chunks.  */
> -		  while (tcache->counts[tc_idx] < mp_.tcache_count
> -			 && (tc_victim = *fb) != NULL)
> -		    {
> -		      if (SINGLE_THREAD_P)
> -			*fb = tc_victim->fd;
> -		      else
> -			{
> -			  REMOVE_FB (fb, pp, tc_victim);
> -			  if (__glibc_unlikely (tc_victim == NULL))
> -			    break;
> -			}
> -		      tcache_put (tc_victim, tc_idx);
> -		    }
> +		  mchunkptr tc_victim = fastbin_pop_entry (fb);
> +		  if (tc_victim == NULL)
> +		    break;
> +		  tcache_put (tc_victim, tc_idx);
>  		}
> -#endif
> -	      void *p = chunk2mem (victim);
> -	      alloc_perturb (p, bytes);
> -	      return p;
>  	    }
> +#endif
> +	  void *p = chunk2mem (victim);
> +	  alloc_perturb (p, bytes);
> +	  return p;
>  	}
>      }
>  
> @@ -4227,28 +4271,7 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>      fb = &fastbin (av, idx);
>  
>      /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
> -    mchunkptr old = *fb, old2;
> -
> -    if (SINGLE_THREAD_P)
> -      {
> -	/* Check that the top of the bin is not the record we are going to
> -	   add (i.e., double free).  */
> -	if (__builtin_expect (old == p, 0))
> -	  malloc_printerr ("double free or corruption (fasttop)");
> -	p->fd = old;
> -	*fb = p;
> -      }
> -    else
> -      do
> -	{
> -	  /* Check that the top of the bin is not the record we are going to
> -	     add (i.e., double free).  */
> -	  if (__builtin_expect (old == p, 0))
> -	    malloc_printerr ("double free or corruption (fasttop)");
> -	  p->fd = old2 = old;
> -	}
> -      while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
> -	     != old2);
> +    mchunkptr old = fastbin_push_entry (fb, p);
>  
>      /* Check that size of fastbin chunk at the top is the same as
>         size of the chunk that we are adding.  We can dereference OLD
> @@ -4439,7 +4462,9 @@ static void malloc_consolidate(mstate av)
>    maxfb = &fastbin (av, NFASTBINS - 1);
>    fb = &fastbin (av, 0);
>    do {
> -    p = atomic_exchange_acq (fb, NULL);
> +    /* Synchronizes with the release MO store in
> +       fastbin_push_entry.  */
> +    p = atomic_exchange_acquire (fb, NULL);
>      if (p != 0) {
>        do {
>  	{

