This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] malloc: brain-dead thread cache
- From: Joern Engel <joern at purestorage dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Cc: Siddhesh Poyarekar <siddhesh dot poyarekar at gmail dot com>, Joern Engel <joern at purestorage dot org>
- Date: Mon, 25 Jan 2016 16:25:07 -0800
- Subject: [PATCH] malloc: brain-dead thread cache
- Authentication-results: sourceware.org; auth=none
- References: <1453767942-19369-1-git-send-email-joern at purestorage dot com>
From: Joern Engel <joern@purestorage.org>
Known limitations: no prefetching yet, cache_gc takes an arena lock for
every single object, and the entire cache gets flushed on gc.
JIRA: PURE-27597
---
tpc/malloc2.13/arena.h | 2 +
tpc/malloc2.13/malloc.c | 11 +++
tpc/malloc2.13/tcache.h | 183 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 196 insertions(+)
create mode 100644 tpc/malloc2.13/tcache.h
diff --git a/tpc/malloc2.13/arena.h b/tpc/malloc2.13/arena.h
index 0804ecfe3a26..d66b4e7029a2 100644
--- a/tpc/malloc2.13/arena.h
+++ b/tpc/malloc2.13/arena.h
@@ -72,6 +72,7 @@ extern int sanity_check_heap_info_alignment[(sizeof (heap_info)
/* Thread specific data */
+static tsd_key_t cache_key;
static tsd_key_t arena_key;
static mutex_t list_lock;
@@ -362,6 +363,7 @@ static void ptmalloc_init(void)
}
mutex_init(&list_lock);
+ tsd_key_create(&cache_key, NULL);
tsd_key_create(&arena_key, NULL);
tsd_setspecific(arena_key, (Void_t *) & main_arena);
thread_atfork(ptmalloc_lock_all, ptmalloc_unlock_all, ptmalloc_unlock_all2);
diff --git a/tpc/malloc2.13/malloc.c b/tpc/malloc2.13/malloc.c
index dca97ef553c0..40f6aa578c6f 100644
--- a/tpc/malloc2.13/malloc.c
+++ b/tpc/malloc2.13/malloc.c
@@ -3254,6 +3254,8 @@ static struct malloc_state *get_backup_arena(struct malloc_state *arena, size_t
/*------------------------ Public wrappers. --------------------------------*/
+#include "tcache.h"
+
Void_t *public_mALLOc(size_t bytes)
{
struct malloc_state *ar_ptr;
@@ -3263,6 +3265,10 @@ Void_t *public_mALLOc(size_t bytes)
if (hook != NULL)
return (*hook) (bytes, RETURN_ADDRESS(0));
+ victim = tcache_malloc(bytes);
+ if (victim)
+ return victim;
+
ar_ptr = arena_get(bytes);
if (!ar_ptr)
return 0;
@@ -3297,6 +3303,11 @@ void public_fREe(Void_t * mem)
return;
}
+ if (tcache_free(p)) {
+ /* Object could be freed on fast path */
+ return;
+ }
+
ar_ptr = arena_for_chunk(p);
arena_lock(ar_ptr);
_int_free(ar_ptr, p);
diff --git a/tpc/malloc2.13/tcache.h b/tpc/malloc2.13/tcache.h
new file mode 100644
index 000000000000..62d58cc77475
--- /dev/null
+++ b/tpc/malloc2.13/tcache.h
@@ -0,0 +1,183 @@
+/*
+ * Thread-cache for malloc.
+ */
+
#if defined(__GNUC__)
/*
 * Find-last-set: returns one plus the index of the most significant
 * set bit of x, or 0 if x is 0.  Same contract as BSD fls(3) and as
 * the x86 bsrl-based version this replaces, but portable to every
 * GCC-compatible compiler/target instead of i386/amd64 only.
 */
static inline int fls(int x)
{
	/* __builtin_clz() is undefined for 0, so handle 0 explicitly. */
	return x ? (int) sizeof(x) * 8 - __builtin_clz((unsigned int) x) : 0;
}
#else
#error Must define fls()
#endif
+
/*
 * Per-thread cache is supposed to reduce lock contention on arenas.
 * When doing allocations we prefetch several identical objects and
 * can return the surplus on future allocations without going to an
 * arena.  On free we keep the freed object in hope of reusing it in
 * future allocations.
 */
#define CACHE_SIZE (1 << 16)
#define MAX_CACHED_SIZE (CACHE_SIZE >> 3)
#define MAX_PREFETCH_SIZE (CACHE_SIZE >> 6)
#define NO_PREFETCH (1 << 3)
/* Alias for the historical misspelling, kept for backward compatibility. */
#define NO_PREFECT NO_PREFETCH

/*
 * Binning is done as a subdivided buddy allocator.  A buddy allocator
 * has one bin per power of two.  We use 16 bins per power of two,
 * yielding a worst-case fragmentation of 6%.
 *
 * Worst-case fragmentation is nearly impossible to reach.  In bad
 * real-world workloads there is a single dominant allocation size,
 * causing most objects to be created with a perfect size.
 */
#define ALIGN_BITS (4)
#define ALIGNMENT (1 << ALIGN_BITS)
#define ALIGN_DOWN(x) ((x) & ~(ALIGNMENT - 1))
#define ALIGN(x) (((x) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1))
#define SUBBIN_BITS (4)
#define SUBBINS (1 << SUBBIN_BITS)
+
+static int cache_bin(int size)
+{
+ int power, subbin;
+
+ size >>= ALIGN_BITS;
+ if (size < SUBBINS)
+ return size;
+ power = fls(size);
+ subbin = size >> (power - SUBBIN_BITS - 1);
+ return SUBBINS * (power - SUBBIN_BITS - 1) + subbin;
+}
+
+/* 97 == cache_bin(MAX_CACHED_SIZE) + 1, i.e. one bin per cacheable size class. */
+#define CACHE_NO_BINS 97
+
+/* Per-thread free-object cache, stored under cache_key via tsd_setspecific. */
+struct thread_cache {
+ /* Bytes in cache */
+ uint32_t tc_size;
+
+ /* Objects in cache */
+ uint32_t tc_count;
+
+ /* Singly-linked lists (chained through chunk->fd), one per size bin. */
+ struct malloc_chunk *tc_bin[CACHE_NO_BINS];
+};
+
+/*
+ * XXX Completely unoptimized - lots of low-hanging fruit
+ */
+static void cache_gc(struct thread_cache *cache)
+{
+ struct malloc_chunk *victim, *next;
+ struct malloc_state *arena;
+ int i;
+
+ for (i = 0; i < CACHE_NO_BINS; i++) {
+ victim = cache->tc_bin[i];
+ if (!victim)
+ continue;
+ cache->tc_bin[i] = NULL;
+ while (victim) {
+ next = victim->fd;
+ cache->tc_size -= chunksize(victim);
+ cache->tc_count--;
+ arena = arena_for_chunk(victim);
+ mutex_lock(&arena->mutex);
+ _int_free(arena, victim);
+ mutex_unlock(&arena->mutex);
+ victim = next;
+ }
+ }
+ assert(cache->tc_size == 0);
+ assert(cache->tc_count == 0);
+}
+
+static void *tcache_malloc(size_t size)
+{
+ struct thread_cache *cache;
+ struct malloc_state *arena;
+ struct malloc_chunk **bin, *victim;
+ size_t nb;
+ int bin_no;
+
+ checked_request2size(size, nb);
+ if (nb > MAX_CACHED_SIZE)
+ return NULL;
+
+ tsd_getspecific(cache_key, cache);
+ if (!cache) {
+ arena = arena_get(sizeof(*cache));
+ cache = _int_malloc(arena, sizeof(*cache));
+ arena_unlock(arena);
+ if (!cache)
+ return NULL;
+ memset(cache, 0, sizeof(*cache));
+ tsd_setspecific(cache_key, cache);
+ }
+
+ bin_no = cache_bin(nb);
+ assert(bin_no < CACHE_NO_BINS);
+
+ bin = &cache->tc_bin[bin_no];
+ victim = *bin;
+ if (victim) {
+ if (chunksize(victim) < nb)
+ return NULL;
+ if (cache_bin(chunksize(*bin)) != bin_no) {
+ malloc_printerr(check_action, "invalid tcache entry", victim);
+ return NULL;
+ }
+ *bin = victim->fd;
+ void *p = chunk2mem(victim);
+ if (perturb_byte)
+ alloc_perturb(p, size);
+ cache->tc_size -= chunksize(victim);
+ cache->tc_count--;
+ return p;
+ }
+ /* TODO: prefetch objects */
+ return NULL;
+}
+
+/*
+ * returns 1 if object was freed
+ */
+static int tcache_free(mchunkptr p)
+{
+ struct thread_cache *cache;
+ struct malloc_chunk **bin;
+ size_t size;
+ int bin_no;
+
+ tsd_getspecific(cache_key, cache);
+ if (!cache)
+ return 0;
+ size = chunksize(p);
+ if (size > MAX_CACHED_SIZE)
+ return 0;
+ bin_no = cache_bin(size);
+ assert(bin_no < CACHE_NO_BINS);
+
+ cache->tc_size += size;
+ cache->tc_count++;
+ bin = &cache->tc_bin[bin_no];
+ if (*bin == p) {
+ malloc_printerr(check_action, "double free or corruption (tcache)", chunk2mem(p));
+ return 0;
+ }
+ if (*bin && cache_bin(chunksize(*bin)) != bin_no) {
+ malloc_printerr(check_action, "invalid tcache entry", chunk2mem(p));
+ return 0;
+ }
+ p->fd = *bin;
+ *bin = p;
+ if (cache->tc_size > CACHE_SIZE)
+ cache_gc(cache);
+ return 1;
+}
--
2.7.0.rc3