[PATCH v4 1/1] memalign: Support scanning for aligned chunks.

DJ Delorie dj@redhat.com
Wed Aug 17 19:00:26 GMT 2022


Ping?

https://patchwork.sourceware.org/project/glibc/patch/xn4jz19fts.fsf@greed.delorie.com/
(I can't find it in the archives though, sent July 28th)

DJ Delorie via Libc-alpha <libc-alpha@sourceware.org> writes:
> [v4: updated testcase to new driver]
>
> [note that I am not pushing this patch for this release, the timing is
> coincidence]
>
> This patch adds a chunk scanning algorithm to the _int_memalign code
> path that reduces heap fragmentation by reusing already aligned chunks
> instead of always looking for chunks of larger sizes and splitting
> them.  The tcache macros are extended to allow removing a chunk from
> the middle of the list.
>
> The goal is to fix the pathological use cases where heaps grow
> continuously in workloads that are heavy users of memalign.
>
> Note that tst-memalign-2 checks for tcache operation, which
> malloc-check bypasses.
>
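> As a rough illustration only (this sketch is not part of the patch),
> the kind of memalign-heavy pattern the scan is aimed at looks like the
> following; the sizes and loop count are arbitrary.  Before this change
> every memalign call asked _int_malloc for a chunk padded by
> alignment + MINSIZE and split it, even when an already aligned free
> chunk of a suitable size was sitting in a bin or in the tcache.
>
>   #include <malloc.h>
>   #include <stdlib.h>
>
>   int
>   main (void)
>   {
>     void *pins[1000];
>
>     for (int i = 0; i < 1000; i++)
>       {
>         /* An aligned block that is freed again right away.  */
>         void *a = memalign (64, 1024);
>         /* A small allocation kept live so neighboring free chunks
>            cannot coalesce, which keeps the heap fragmented.  */
>         pins[i] = malloc (32);
>         free (a);
>       }
>
>     for (int i = 0; i < 1000; i++)
>       free (pins[i]);
>     return 0;
>   }
>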
> diff --git a/malloc/Makefile b/malloc/Makefile
> index 4e32de2a0b..084c408ac7 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -43,6 +43,7 @@ tests := mallocbug tst-malloc tst-valloc tst-calloc tst-obstack \
>  	 tst-tcfree1 tst-tcfree2 tst-tcfree3 \
>  	 tst-safe-linking \
>  	 tst-mallocalign1 \
> +	 tst-memalign-2
>  
>  tests-static := \
>  	 tst-interpose-static-nothread \
> @@ -72,7 +73,7 @@ test-srcs = tst-mtrace
>  # with MALLOC_CHECK_=3 because they expect a specific failure.
>  tests-exclude-malloc-check = tst-malloc-check tst-malloc-usable \
>  	tst-mxfast tst-safe-linking \
> -	tst-compathooks-off tst-compathooks-on
> +	tst-compathooks-off tst-compathooks-on tst-memalign-2
>  
>  # Run all tests with MALLOC_CHECK_=3
>  tests-malloc-check = $(filter-out $(tests-exclude-malloc-check) \
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index bd3c76ed31..3b31d6ae0f 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -3180,19 +3180,44 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
>  }
>  
>  /* Caller must ensure that we know tc_idx is valid and there's
> -   available chunks to remove.  */
> +   available chunks to remove.  Removes chunk from the middle of the
> +   list.  */
>  static __always_inline void *
> -tcache_get (size_t tc_idx)
> +tcache_get_n (size_t tc_idx, tcache_entry **ep)
>  {
> -  tcache_entry *e = tcache->entries[tc_idx];
> +  tcache_entry *e;
> +  if (ep == &(tcache->entries[tc_idx]))
> +    e = *ep;
> +  else
> +    e = REVEAL_PTR (*ep);
> +
>    if (__glibc_unlikely (!aligned_OK (e)))
>      malloc_printerr ("malloc(): unaligned tcache chunk detected");
> -  tcache->entries[tc_idx] = REVEAL_PTR (e->next);
> +
> +  if (ep == &(tcache->entries[tc_idx]))
> +    *ep = REVEAL_PTR (e->next);
> +  else
> +    *ep = PROTECT_PTR (ep, REVEAL_PTR (e->next));
> +
>    --(tcache->counts[tc_idx]);
>    e->key = 0;
>    return (void *) e;
>  }
>  
> +/* Like the above, but removes from the head of the list.  */
> +static __always_inline void *
> +tcache_get (size_t tc_idx)
> +{
> +  return tcache_get_n (tc_idx, & tcache->entries[tc_idx]);
> +}
> +
> +/* Returns the next entry in a tcache linked list.  */
> +static __always_inline tcache_entry *
> +tcache_next (tcache_entry *e)
> +{
> +  return (tcache_entry *) REVEAL_PTR (e->next);
> +}
> +
>  static void
>  tcache_thread_shutdown (void)
>  {
> @@ -3301,7 +3326,7 @@ __libc_malloc (size_t bytes)
>  
>    DIAG_PUSH_NEEDS_COMMENT;
>    if (tc_idx < mp_.tcache_bins
> -      && tcache
> +      && tcache != NULL
>        && tcache->counts[tc_idx] > 0)
>      {
>        victim = tcache_get (tc_idx);
> @@ -3552,6 +3577,38 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
>        alignment = a;
>      }
>  
> +#if USE_TCACHE
> +  {
> +    size_t tbytes;
> +    tbytes = checked_request2size (bytes);
> +    if (tbytes == 0)
> +      {
> +	__set_errno (ENOMEM);
> +	return NULL;
> +      }
> +    size_t tc_idx = csize2tidx (tbytes);
> +
> +    if (tc_idx < mp_.tcache_bins
> +	&& tcache != NULL
> +	&& tcache->counts[tc_idx] > 0)
> +      {
> +	/* The tcache itself isn't encoded, but the chain is.  */
> +	tcache_entry **tep = & tcache->entries[tc_idx];
> +	tcache_entry *te = *tep;
> +	while (te != NULL && ((intptr_t)te & (alignment - 1)) != 0)
> +	  {
> +	    tep = & (te->next);
> +	    te = tcache_next (te);
> +	  }
> +	if (te != NULL)
> +	  {
> +	    void *victim = tcache_get_n (tc_idx, tep);
> +	    return tag_new_usable (victim);
> +	  }
> +      }
> +  }
> +#endif
> +
>    if (SINGLE_THREAD_P)
>      {
>        p = _int_memalign (&main_arena, alignment, bytes);
> @@ -3857,7 +3914,7 @@ _int_malloc (mstate av, size_t bytes)
>  	      /* While we're here, if we see other chunks of the same size,
>  		 stash them in the tcache.  */
>  	      size_t tc_idx = csize2tidx (nb);
> -	      if (tcache && tc_idx < mp_.tcache_bins)
> +	      if (tcache != NULL && tc_idx < mp_.tcache_bins)
>  		{
>  		  mchunkptr tc_victim;
>  
> @@ -3915,7 +3972,7 @@ _int_malloc (mstate av, size_t bytes)
>  	  /* While we're here, if we see other chunks of the same size,
>  	     stash them in the tcache.  */
>  	  size_t tc_idx = csize2tidx (nb);
> -	  if (tcache && tc_idx < mp_.tcache_bins)
> +	  if (tcache != NULL && tc_idx < mp_.tcache_bins)
>  	    {
>  	      mchunkptr tc_victim;
>  
> @@ -3977,7 +4034,7 @@ _int_malloc (mstate av, size_t bytes)
>  #if USE_TCACHE
>    INTERNAL_SIZE_T tcache_nb = 0;
>    size_t tc_idx = csize2tidx (nb);
> -  if (tcache && tc_idx < mp_.tcache_bins)
> +  if (tcache != NULL && tc_idx < mp_.tcache_bins)
>      tcache_nb = nb;
>    int return_cached = 0;
>  
> @@ -4059,7 +4116,7 @@ _int_malloc (mstate av, size_t bytes)
>  #if USE_TCACHE
>  	      /* Fill cache first, return to user only if cache fills.
>  		 We may return one of these chunks later.  */
> -	      if (tcache_nb
> +	      if (tcache_nb > 0
>  		  && tcache->counts[tc_idx] < mp_.tcache_count)
>  		{
>  		  tcache_put (victim, tc_idx);
> @@ -4932,6 +4989,43 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
>     ------------------------------ memalign ------------------------------
>   */
>  
> +/* Returns 0 if the chunk is not, and does not contain, the requested
> +   aligned sub-chunk; otherwise returns the amount of "waste" left
> +   over after trimming.  BYTES is the *user* byte size, not the
> +   chunk byte size.  */
> +static int
> +chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t bytes)
> +{
> +  void *m = chunk2mem (p);
> +  INTERNAL_SIZE_T size = memsize (p);
> +  void *aligned_m = m;
> +
> +  if (__glibc_unlikely (misaligned_chunk (p)))
> +    malloc_printerr ("_int_memalign(): unaligned chunk detected");
> +
> +  aligned_m = PTR_ALIGN_UP (m, alignment);
> +
> +  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
> +
> +  /* We can't trim off the front as it's too small.  */
> +  if (front_extra > 0 && front_extra < MINSIZE)
> +    return 0;
> +
> +  /* If it's a perfect fit, it's an exception to the return value rule
> +     (we would return zero waste, which looks like "not usable"), so
> +     handle it here by returning a small non-zero value instead.  */
> +  if (size == bytes && front_extra == 0)
> +    return 1;
> +
> +  /* If the block we need fits in the chunk, calculate total waste.  */
> +  if (size > bytes + front_extra)
> +    return size - bytes;
> +
> +  /* Can't use this chunk.  */
> +  return 0;
> +}
> +
> +/* BYTES is user requested bytes, not requested chunksize bytes.  */
>  static void *
>  _int_memalign (mstate av, size_t alignment, size_t bytes)
>  {
> @@ -4945,8 +5039,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>    mchunkptr remainder;            /* spare room at end to split off */
>    unsigned long remainder_size;   /* its size */
>    INTERNAL_SIZE_T size;
> -
> -
> +  mchunkptr victim;
>  
>    nb = checked_request2size (bytes);
>    if (nb == 0)
> @@ -4955,29 +5048,142 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
>        return NULL;
>      }
>  
> -  /*
> -     Strategy: find a spot within that chunk that meets the alignment
> +  /* We can't check the tcache here because we hold the arena lock,
> +     which the tcache code does not expect.  We expect the caller
> +     (_mid_memalign) to have checked it already.  */
> +
> +  /* Strategy: search the bins looking for an existing block that
> +     meets our needs.  We scan a range of bins from "exact size" to
> +     "just under 2x", spanning the small/large barrier if needed.  If
> +     we don't find anything in those bins, the common malloc code will
> +     scan starting at 2x.  */
> +
> +  /* This will be set if we found a candidate chunk.  */
> +  victim = NULL;
> +
> +  /* Fast bins are singly-linked, so removing a chunk from the middle
> +     is hard, and fastbin chunks are unlikely to meet our alignment
> +     requirements anyway; we have not experimented with scanning them.  */
> +
> +  int first_bin_index;
> +  int first_largebin_index;
> +  int last_bin_index;
> +
> +  if (in_smallbin_range (nb))
> +    first_bin_index = smallbin_index (nb);
> +  else
> +    first_bin_index = largebin_index (nb);
> +
> +  if (in_smallbin_range (nb * 2))
> +    last_bin_index = smallbin_index (nb * 2);
> +  else
> +    last_bin_index = largebin_index (nb * 2);
> +
> +  first_largebin_index = largebin_index (MIN_LARGE_SIZE);
> +
> +  int victim_index;                 /* its bin index */
> +
> +  for (victim_index = first_bin_index;
> +       victim_index < last_bin_index;
> +       victim_index ++)
> +    {
> +      victim = NULL;
> +
> +      if (victim_index < first_largebin_index)
> +    {
> +      /* Check small bins.  Small bin chunks are doubly-linked despite
> +	 being the same size.  */
> +
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +      while (fwd != bck)
> +	{
> +	  if (chunk_ok_for_memalign (fwd, alignment, bytes) > 0)
> +	    {
> +	      victim = fwd;
> +
> +	      /* Unlink it */
> +	      victim->fd->bk = victim->bk;
> +	      victim->bk->fd = victim->fd;
> +	      break;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +    }
> +  else
> +    {
> +      /* Check large bins.  */
> +      mchunkptr fwd;                    /* misc temp for linking */
> +      mchunkptr bck;                    /* misc temp for linking */
> +      mchunkptr best = NULL;
> +      size_t best_size = 0;
> +
> +      bck = bin_at (av, victim_index);
> +      fwd = bck->fd;
> +
> +      while (fwd != bck)
> +	{
> +	  int extra;
> +
> +	  if (chunksize (fwd) < nb)
> +	      break;
> +	  extra = chunk_ok_for_memalign (fwd, alignment, bytes);
> +	  if (extra > 0
> +	      && (extra <= best_size || best == NULL))
> +	    {
> +	      best = fwd;
> +	      best_size = extra;
> +	    }
> +
> +	  fwd = fwd->fd;
> +	}
> +      victim = best;
> +
> +      if (victim != NULL)
> +	{
> +	  unlink_chunk (av, victim);
> +	  break;
> +	}
> +    }
> +
> +      if (victim != NULL)
> +	break;
> +    }
> +
> +  /* Strategy: find a spot within that chunk that meets the alignment
>       request, and then possibly free the leading and trailing space.
> -   */
> +     This strategy is incredibly costly and can lead to external
> +     fragmentation if header and footer chunks are unused.  */
>  
> -  /* Call malloc with worst case padding to hit alignment. */
> +  if (victim != NULL)
> +    {
> +      p = victim;
> +      m = chunk2mem (p);
> +      set_inuse (p);
> +    }
> +  else
> +    {
> +      /* Call malloc with worst case padding to hit alignment. */
>  
> -  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
> +      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
>  
> -  if (m == 0)
> -    return 0;           /* propagate failure */
> +      if (m == 0)
> +	return 0;           /* propagate failure */
>  
> -  p = mem2chunk (m);
> +      p = mem2chunk (m);
> +    }
>  
>    if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
> -
> -    { /*
> -                Find an aligned spot inside chunk.  Since we need to give back
> -                leading space in a chunk of at least MINSIZE, if the first
> -                calculation places us at a spot with less than MINSIZE leader,
> -                we can move to the next aligned spot -- we've allocated enough
> -                total room so that this is always possible.
> -                 */
> +    {
> +      /* Find an aligned spot inside chunk.  Since we need to give back
> +         leading space in a chunk of at least MINSIZE, if the first
> +         calculation places us at a spot with less than MINSIZE leader,
> +         we can move to the next aligned spot -- we've allocated enough
> +         total room so that this is always possible.  */
>        brk = (char *) mem2chunk (((unsigned long) (m + alignment - 1)) &
>                                  - ((signed long) alignment));
>        if ((unsigned long) (brk - (char *) (p)) < MINSIZE)
> diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
> new file mode 100644
> index 0000000000..ed3660959a
> --- /dev/null
> +++ b/malloc/tst-memalign-2.c
> @@ -0,0 +1,136 @@
> +/* Test for memalign chunk reuse.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <malloc.h>
> +#include <stdio.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <array_length.h>
> +
> +#include <support/check.h>
> +
> +typedef struct TestCase {
> +  size_t size;
> +  size_t alignment;
> +  void *ptr1;
> +  void *ptr2;
> +} TestCase;
> +
> +static TestCase tcache_allocs[] = {
> +  { 24, 8, NULL, NULL },
> +  { 24, 16, NULL, NULL },
> +  { 128, 32, NULL, NULL }
> +};
> +#define TN array_length (tcache_allocs)
> +
> +static TestCase large_allocs[] = {
> +  { 23450, 64, NULL, NULL },
> +  { 23450, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23550, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 23650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL },
> +  { 33650, 64, NULL, NULL }
> +};
> +#define LN array_length (large_allocs)
> +
> +void *p;
> +
> +static int
> +do_test (void)
> +{
> +  int i, j;
> +  int count;
> +  void *ptr[10];
> +  void *p;
> +
> +  /* TCache test.  */
> +
> +  for (i = 0; i < TN; ++ i)
> +    {
> +      tcache_allocs[i].ptr1 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      free (tcache_allocs[i].ptr1);
> +      /* This should return the same chunk as was just freed.  */
> +      tcache_allocs[i].ptr2 = memalign (tcache_allocs[i].alignment, tcache_allocs[i].size);
> +      free (tcache_allocs[i].ptr2);
> +
> +      TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
> +    }
> +
> +  /* Test for non-head tcache hits.  */
> +  for (i = 0; i < 10; ++ i)
> +    {
> +      if (i == 4)
> +	ptr[i] = memalign (64, 256);
> +      else
> +	ptr[i] = malloc (256);
> +    }
> +  for (i = 0; i < 10; ++ i)
> +    free (ptr[i]);
> +
> +  p = memalign (64, 256);
> +
> +  count = 0;
> +  for (i = 0; i < 10; ++ i)
> +    if (ptr[i] == p)
> +      ++ count;
> +  free (p);
> +  TEST_VERIFY (count > 0);
> +
> +  /* Large bins test.  */
> +
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      large_allocs[i].ptr1 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +      /* Keep chunks from combining by fragmenting the heap.  */
> +      p = malloc (512);
> +    }
> +
> +  for (i = 0; i < LN; ++ i)
> +    free (large_allocs[i].ptr1);
> +
> +  /* Force the unsorted bin to be scanned and its chunks sorted into
> +     the small/large bins.  */
> +  p = malloc (60000);
> +
> +  for (i = 0; i < LN; ++ i)
> +    large_allocs[i].ptr2 = memalign (large_allocs[i].alignment, large_allocs[i].size);
> +
> +  count = 0;
> +  for (i = 0; i < LN; ++ i)
> +    {
> +      int ok = 0;
> +      for (j = 0; j < LN; ++ j)
> +	if (large_allocs[i].ptr1 == large_allocs[j].ptr2)
> +	  ok = 1;
> +      if (ok == 1)
> +	count ++;
> +    }
> +
> +  /* The allocation algorithm is complicated outside of the memalign
> +     logic, so just make sure it's working for most of the
> +     allocations.  This avoids possible boundary conditions with
> +     empty/full heaps.  */
> +  TEST_VERIFY (count > LN / 2);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
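
As a worked example of the new chunk_ok_for_memalign helper (the numbers
here are made up and assume a 64-bit build where MINSIZE is 32): if a
free chunk's user area m starts at 0x1010 and memsize (p) is 0x130, then
for alignment 0x40 and bytes 0xf0 we get aligned_m = 0x1040 and
front_extra = 0x30.  front_extra is at least MINSIZE, so the leading
space can be split off as its own chunk, and 0x130 > 0xf0 + 0x30, so the
chunk is usable; the helper reports a waste of 0x130 - 0xf0 = 0x40.  The
small-bin scan takes the first chunk that qualifies, while the large-bin
scan keeps the candidate that reports the least waste.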


