This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] malloc: better inline documentation
- From: Joern Engel <joern at purestorage dot com>
- To: "GNU C. Library" <libc-alpha at sourceware dot org>
- Cc: Siddhesh Poyarekar <siddhesh dot poyarekar at gmail dot com>, Joern Engel <joern at purestorage dot com>
- Date: Mon, 25 Jan 2016 16:25:19 -0800
- Subject: [PATCH] malloc: better inline documentation
- Authentication-results: sourceware.org; auth=none
- References: <1453767942-19369-1-git-send-email-joern at purestorage dot com>
JIRA: PURE-27597
---
tpc/malloc2.13/arena.h | 20 ++++++++++++++++++++
tpc/malloc2.13/malloc.c | 7 +++++++
tpc/malloc2.13/tcache.h | 15 +++++++++++----
3 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/tpc/malloc2.13/arena.h b/tpc/malloc2.13/arena.h
index 685822897d97..7f50dacb8297 100644
--- a/tpc/malloc2.13/arena.h
+++ b/tpc/malloc2.13/arena.h
@@ -454,6 +454,19 @@ dump_heap(heap_info *heap)
multiple threads, but only one will succeed. */
static char *aligned_heap_area;
+/*
+ * Preferentially mmaps huge pages, falling back on small pages if
+ * necessary. For Pure kernels huge pages are not cleared, so we
+ * have to do so here. Pushing it down to the caller is a small
+ * optimization in the cases where new_heap() allocated twice the
+ * necessary memory for alignment, then frees the unaligned bits.
+ * Only clearing the remaining half means we spend half the time.
+ *
+ * Only part of the heap has to be cleared, so yet another
+ * optimization would be possible. Most likely we only need to
+ * clear the arena-header and heap-header. But someone needs to do
+ * the homework before enabling this optimization.
+ */
static void *mmap_for_heap(void *addr, size_t length, int *must_clear)
{
int prot = PROT_READ | PROT_WRITE;
@@ -776,6 +789,13 @@ static struct malloc_state *arena_get(size_t size)
struct malloc_state *arena = NULL;
int node = getnode();
+ /*
+ * getnode() is inherently racy. It returns the correct node
+ * number at the time of the syscall, but the thread may be
+ * migrated to a different node at any moment, even before
+ * getnode() returns. Nothing we can do about this, we try
+ * to use a numa-local arena, but are limited to best-effort.
+ */
tsd_getspecific(arena_key, arena);
if (!arena || arena->numa_node != node) {
arena = numa_arena[node];
diff --git a/tpc/malloc2.13/malloc.c b/tpc/malloc2.13/malloc.c
index 46b3545aaa8f..18c7b407bbea 100644
--- a/tpc/malloc2.13/malloc.c
+++ b/tpc/malloc2.13/malloc.c
@@ -3247,6 +3247,13 @@ mremap_chunk(mchunkptr p, size_t new_size)
#endif /* HAVE_MREMAP */
+/*
+ * Rationale behind this function is that if you cannot find enough
+ * memory through sbrk, which the main_arena uses, you might be
+ * successful with mmap or vice versa. It is unclear whether the
+ * rationale still makes sense. I invite anyone to do the mental
+ * exercise and prove we can remove this function.
+ */
static struct malloc_state *get_backup_arena(struct malloc_state *arena, size_t bytes)
{
if (arena != &main_arena) {
diff --git a/tpc/malloc2.13/tcache.h b/tpc/malloc2.13/tcache.h
index 628dbc00256a..b269498657f3 100644
--- a/tpc/malloc2.13/tcache.h
+++ b/tpc/malloc2.13/tcache.h
@@ -19,10 +19,17 @@ static inline int fls(int x)
/*
* Per-thread cache is supposed to reduce lock contention on arenas.
- * When doing allocations we prefetch several identical objects and
- * can return the surplus on future allocations without going to an
- * arena. On free we keep the freed object in hope of reusing it in
- * future allocations.
+ * Freed objects go to the cache first, allowing allocations to be
+ * serviced from it without going to the arenas. On cache misses we
+ * have to take the arena lock, but can amortize the cost by
+ * prefetching additional objects for future use.
+ *
+ * Prefetching is a heuristic. If an object of size X is requested,
+ * we assume more objects of the same size will be requested in the
+ * near future. If correct, this reduces locking overhead. If
+ * incorrect, we spend CPU cycles and pollute the tcache with unused
+ * objects. Sweet spot depends on the workload, but seems to be
+ * around one.
*/
#define CACHE_SIZE_BITS (17)
#define CACHE_SIZE (1 << CACHE_SIZE_BITS)
--
2.7.0.rc3