This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] malloc: better inline documentation
- From: Joern Engel <joern at purestorage dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Cc: Siddhesh Poyarekar <siddhesh dot poyarekar at gmail dot com>, Joern Engel <joern at purestorage dot com>
- Date: Mon, 25 Jan 2016 16:25:19 -0800
- Subject: [PATCH] malloc: better inline documentation
- Authentication-results: sourceware.org; auth=none
- References: <1453767942-19369-1-git-send-email-joern at purestorage dot com>
JIRA: PURE-27597
---
tpc/malloc2.13/arena.h | 20 ++++++++++++++++++++
tpc/malloc2.13/malloc.c | 7 +++++++
tpc/malloc2.13/tcache.h | 15 +++++++++++----
3 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/tpc/malloc2.13/arena.h b/tpc/malloc2.13/arena.h
index 685822897d97..7f50dacb8297 100644
--- a/tpc/malloc2.13/arena.h
+++ b/tpc/malloc2.13/arena.h
@@ -454,6 +454,19 @@ dump_heap(heap_info *heap)
multiple threads, but only one will succeed. */
static char *aligned_heap_area;
+/*
+ * Preferentially mmaps huge pages, falling back on small pages if
+ * necessary. For Pure kernels huge pages are not cleared, so we
+ * have to do so here. Pushing it down to the caller is a small
+ * optimization in the cases where new_heap() allocated twice the
+ * necessary memory for alignment, then frees the unaligned bits.
+ * Only clearing the remaining half means we spend half the time.
+ *
+ * Only part of the heap has to be cleared, so yet another
+ * optimization would be possible. Most likely we only need to
+ * clear the arena-header and heap-header. But someone needs to do
+ * the homework before enabling this optimization.
+ */
static void *mmap_for_heap(void *addr, size_t length, int *must_clear)
{
int prot = PROT_READ | PROT_WRITE;
@@ -776,6 +789,13 @@ static struct malloc_state *arena_get(size_t size)
struct malloc_state *arena = NULL;
int node = getnode();
+ /*
+ * getnode() is inherently racy. It returns the correct node
+ * number at the time of the syscall, but the thread may be
+ * migrated to a different node at any moment, even before
+ * getnode() returns. Nothing we can do about this, we try
+ * to use a numa-local arena, but are limited to best-effort.
+ */
tsd_getspecific(arena_key, arena);
if (!arena || arena->numa_node != node) {
arena = numa_arena[node];
diff --git a/tpc/malloc2.13/malloc.c b/tpc/malloc2.13/malloc.c
index 46b3545aaa8f..18c7b407bbea 100644
--- a/tpc/malloc2.13/malloc.c
+++ b/tpc/malloc2.13/malloc.c
@@ -3247,6 +3247,13 @@ mremap_chunk(mchunkptr p, size_t new_size)
#endif /* HAVE_MREMAP */
+/*
+ * Rationale behind this function is that if you cannot find enough
+ * memory through sbrk, which the main_arena uses, you might be
+ * successful with mmap or vice versa. It is unclear whether the
+ * rationale still makes sense. I invite anyone to do the mental
+ * exercise and prove we can remove this function.
+ */
static struct malloc_state *get_backup_arena(struct malloc_state *arena, size_t bytes)
{
if (arena != &main_arena) {
diff --git a/tpc/malloc2.13/tcache.h b/tpc/malloc2.13/tcache.h
index 628dbc00256a..b269498657f3 100644
--- a/tpc/malloc2.13/tcache.h
+++ b/tpc/malloc2.13/tcache.h
@@ -19,10 +19,17 @@ static inline int fls(int x)
/*
* Per-thread cache is supposed to reduce lock contention on arenas.
- * When doing allocations we prefetch several identical objects and
- * can return the surplus on future allocations without going to an
- * arena. On free we keep the freed object in hope of reusing it in
- * future allocations.
+ * Freed objects go to the cache first, allowing allocations to be
+ * serviced from it without going to the arenas. On cache misses we
+ * have to take the arena lock, but can amortize the cost by
+ * prefetching additional objects for future use.
+ *
+ * Prefetching is a heuristic. If an object of size X is requested,
+ * we assume more objects of the same size will be requested in the
+ * near future. If correct, this reduces locking overhead. If
+ * incorrect, we spend CPU cycles and pollute the tcache with unused
+ * objects. Sweet spot depends on the workload, but seems to be
+ * around one.
*/
#define CACHE_SIZE_BITS (17)
#define CACHE_SIZE (1 << CACHE_SIZE_BITS)
--
2.7.0.rc3