
[WIP] Improving malloc with an atomicless cache.


Hi, my main idea for speeding up malloc is to avoid atomic instructions for
small sizes. These are slow (about 30 cycles in the best case).

For these we keep a thread-local cache. As we already need TLS to
determine a thread's arena, the TLS access does not incur additional
overhead over the existing implementation.

I wrote a similar proposal earlier, and one comment was that on signals
a deadlock is preferable to incomprehensible corruption. This can be
solved by making the fast path signal-safe; making the entire allocator
signal-safe has minimal performance impact.
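
In outline, a plain per-thread flag is enough to guard the fast path,
because a signal handler runs on the same thread (cache_alloc and
fallback_alloc here are hypothetical stand-ins; the wrapper below does
the same thing with pool.recursed and malloc_by_mmap):

static __thread int busy;

void *malloc (size_t size)
{
  if (busy)                        /* we interrupted ourselves from a signal handler */
    return fallback_alloc (size);  /* hypothetical signal-safe path */
  busy = 1;
  void *ret = cache_alloc (size);  /* hypothetical cache fast path */
  busy = 0;
  return ret;
}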

There are several steps that need to be taken; the first is adding the
cache. The code below is a wrapper over the libc malloc that implements
a simple cache (for now you need to link it as a library; LD_PRELOAD
does not work yet).
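
For reference, building and linking it might look like this (the file
names are just examples):

  gcc -shared -fPIC malloc_cache.c -o libmalloc_cache.so -ldl -lpthread
  gcc test.c -o test -L. -lmalloc_cache
  LD_LIBRARY_PATH=. ./test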

The main problem here is limiting the memory kept in the caches, as it
is harder to return it back to the system. One approach would be to
enforce that the total memory used is at most a configurable parameter
(say 4 MB by default); a second, that there is less memory in the cache
than memory currently allocated by the user.
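
A minimal sketch of the first approach, assuming a per-thread byte
counter and a hypothetical flush_cache that drains pool.stack[] back to
the real free:

#define CACHE_LIMIT (4 * 1024 * 1024)   /* configurable, 4 MB by default */
static __thread size_t cached_bytes;

static void cache_push (struct stack *b, size_t idx)
{
  if (cached_bytes + idx * CHUNKSIZE > CACHE_LIMIT)
    flush_cache ();                /* hypothetical: freep everything cached,
                                      resetting cached_bytes to zero */
  cached_bytes += idx * CHUNKSIZE;
  b->next = pool.stack[idx];
  pool.stack[idx] = b;
}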

I did not add a check that the memory comes from the arena of the given
thread; that is a separate topic that will be handled by a separate
patch. With a bit of care we could implement free without locking
foreign arenas, by adding an atomic queue to each arena and having
malloc/free atomically replace that queue with an empty one before
doing the actual deallocation.
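
A minimal sketch of such a queue with C11 atomics (struct arena and its
remote_free field are hypothetical):

#include <stdatomic.h>

struct arena
{
  _Atomic (struct stack *) remote_free;
};

/* Foreign thread: push with a CAS loop, no arena lock taken.  */
static void remote_enqueue (struct arena *a, struct stack *b)
{
  b->next = atomic_load (&a->remote_free);
  while (!atomic_compare_exchange_weak (&a->remote_free, &b->next, b))
    ; /* on failure b->next was refreshed to the current head; retry */
}

/* Owner thread: take the whole queue with one exchange, then free the blocks.  */
static struct stack *drain_remote (struct arena *a)
{
  return atomic_exchange (&a->remote_free, NULL);
}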

Then we could decrease memory consumption by using a different data
layout. For example, for small requests we would have a dedicated page
where all blocks have the same size. Then we do not need per-chunk
headers; we just look at the start of the page to read the size.
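
A minimal sketch of that lookup, assuming such pages are 4096 bytes and
allocated aligned (the names are hypothetical):

#include <stdint.h>

#define SMALL_PAGE_SIZE 4096

struct size_page
{
  size_t block_size;   /* the same for every block on this page */
  /* the equally sized blocks follow */
};

static size_t block_size_of (void *mem)
{
  /* No per-chunk header: mask the pointer down to the page start.  */
  struct size_page *pg
    = (struct size_page *) ((uintptr_t) mem & ~(uintptr_t) (SMALL_PAGE_SIZE - 1));
  return pg->block_size;
}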



#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdlib.h>
#include <pthread.h>
#include <string.h>

#ifndef INTERNAL_SIZE_T
#define INTERNAL_SIZE_T size_t
#endif

/* The corresponding word size */
#define SIZE_SZ                (sizeof(INTERNAL_SIZE_T))
#define CHUNKSIZE (2 * SIZE_SZ)

#define PREV_INUSE 0x1
#define IS_MMAPPED 0x2
#define NON_MAIN_ARENA 0x4

#define SIZE_BITS (PREV_INUSE|IS_MMAPPED|NON_MAIN_ARENA)
#define chunksize(p)         ((p)->size & ~(SIZE_BITS))

typedef struct malloc_chunk* mchunkptr;

struct malloc_chunk {

  INTERNAL_SIZE_T      prev_size;  /* Size of previous chunk (if free).  */
  INTERNAL_SIZE_T      size;       /* Size in bytes, including overhead. */

  struct malloc_chunk* fd;         /* double links -- used only if free. */
  struct malloc_chunk* bk;

  /* Only used for large blocks: pointer to next larger size.  */
  struct malloc_chunk* fd_nextsize; /* double links -- used only if free. */
  struct malloc_chunk* bk_nextsize;
};

#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ))

static void *(*mallocp)(size_t);
static void *(*reallocp)(void *, size_t);
static void (*freep)(void *);

static pthread_key_t destructor_key;

static void free_thread_data (void *);

static void init_blocks (void)
{
  pthread_key_create (&destructor_key, free_thread_data);
  /* dlsym may itself call malloc; those requests are served by the
     temporary bump allocator below while inited == 1.  */
  mallocp = dlsym (RTLD_NEXT, "malloc");
  reallocp = dlsym (RTLD_NEXT, "realloc");
  freep = dlsym (RTLD_NEXT, "free");
}

/* Temporary bump allocator used while dlsym resolves the real hooks.
   Keep it chunk-aligned so the returned pointers have malloc alignment.  */
static char temp[10000] __attribute__ ((aligned (2 * sizeof (size_t))));
static int tempno;

struct stack
{
  struct stack *next;
};

struct thread_pool
{
  struct stack *stack[17];
  int recursed;

  /* Frees from another thread or from a signal handler could be pushed
     atomically onto this stack, which the owning thread drains.  */
  struct stack *to_free;
  int zombie;
};

static __thread struct thread_pool pool;

static int inited;

static void init_thread (void)
{
  /* Note: this handshake is not thread-safe; it assumes the first
     allocation happens before additional threads start.  A concurrent
     or recursive first call sees inited == 1 and aborts.  */
  if (!inited)
    {
      inited = 1;
      init_blocks ();
      inited = 2;
    }
  if (inited == 1)
    abort ();
  pthread_setspecific (destructor_key, &pool);
}

void *malloc_by_mmap (size_t size)
{
  /* In separate patch.  */
  abort();
}

static void free_thread_data (void *data)
{
  /* Runs at thread exit; return every cached block to the real free.
     The destructor argument is the thread's pool, which is still
     directly accessible as TLS here.  */
  size_t idx;
  for (idx = 1; idx <= 16; idx++)
    {
      struct stack *p = pool.stack[idx], *oldp;
      while (p)
        {
          oldp = p;
          p = p->next;
          freep (oldp);
        }
    }
}

#define REC_FREE 1
#define REC_LOCK 2

/* Per-thread recursion flag doubling as the signal-safety lock: no
   atomics are needed because a signal handler runs on the same thread.
   Returns nonzero when the cache is already busy (we re-entered from a
   signal handler), in which case the caller must take a fallback path.  */
static inline int as_lock (void)
{
  if (pool.recursed != REC_FREE)
    {
      if (pool.recursed == 0)
        {
          init_thread ();
          pool.recursed = REC_FREE;
        }
      else
        return -1;
    }
  pool.recursed = REC_LOCK;
  return 0;
}

static inline void as_unlock (void)
{
  pool.recursed = REC_FREE;
}

void *malloc (size_t size)
{
  size_t idx = (size + CHUNKSIZE - 1) / CHUNKSIZE;

  if (inited == 1) /* dlsym needs malloc, so use the temporary allocator */
    {
      char *ret = temp + tempno;
      tempno += idx * CHUNKSIZE;
      if (tempno > (int) sizeof (temp))
        abort (); /* temporary buffer exhausted */
      return ret;
    }

  if (as_lock ())
    return malloc_by_mmap (size);

  if (idx <= 16)
    {
      if (!pool.stack[idx])
        {
          as_unlock ();
          return mallocp (size);
        }
      /* Pop a cached block of this size class off the per-thread stack.  */
      struct stack *ret = pool.stack[idx];
      pool.stack[idx] = ret->next;

      as_unlock ();
      return (void *) ret;
    }
  else
    {
      as_unlock ();
      return mallocp (size);
    }
}

void *realloc (void *old, size_t size)
{
  /* The cache-aware path below is disabled for now; just forward.  */
  return reallocp (old, size);

  if (old == NULL)
    return malloc (size);

  mchunkptr p = mem2chunk (old);
  size_t oldsize = chunksize (p) - CHUNKSIZE;
  size_t idx = (oldsize + CHUNKSIZE - 1) / CHUNKSIZE;

  if (idx <= 16)
    {
      /* Small blocks: grow by copy, shrink in place.  */
      if (size > oldsize)
        {
          void *new = malloc (size);
          memcpy (new, old, oldsize);
          free (old);
          return new;
        }
      else
        return old;
    }
  return reallocp (old, size);
}

void free (void *mem)
{
  if (!mem)
    return;
  mchunkptr p = mem2chunk (mem);
  size_t size = chunksize (p) - CHUNKSIZE;
  size_t idx = (size + CHUNKSIZE - 1) / CHUNKSIZE;

  if (idx <= 16)
    {
      struct stack *ret = (struct stack *) mem;

      if (as_lock ())
        return; /* Re-entered from a signal handler; leaked for now, the
                   to_free queue is meant to pick these up later.  */

      /* Push the block onto the per-thread stack for its size class.  */
      ret->next = pool.stack[idx];
      pool.stack[idx] = ret;

      as_unlock ();
    }
  else
    freep (mem); /* The real free expects the user pointer, not the chunk.  */
}

