This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PATCH] malloc: remove tcache prefetching


The code doesn't help.  The best I could demonstrate, in an artificial
test that spends all its time in malloc/free, is a 1% performance
improvement, and only when all allocations are the same size.  Once you
mix sizes and prefetched objects have a chance of going unused and
being freed again, performance degrades.  It was a nice idea, but it
didn't survive the reality test, so let's remove it.
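
For reference, the artificial test was roughly of the following shape
(a minimal sketch, not the actual benchmark; the sizes, iteration
counts and helper names are illustrative): time a tight malloc/free
loop once with a single allocation size and once with mixed sizes,
with the allocator under test LD_PRELOADed.

/* bench.c - hypothetical malloc/free microbenchmark sketch.
 * Build: cc -O2 bench.c -o bench
 * Run:   LD_PRELOAD=./libmalloc.so ./bench
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ITERATIONS	(10 * 1000 * 1000)
#define BATCH		64

static double bench(size_t (*next_size)(unsigned))
{
	void *ptrs[BATCH];
	struct timespec t0, t1;
	unsigned i, j;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < ITERATIONS / BATCH; i++) {
		for (j = 0; j < BATCH; j++)
			ptrs[j] = malloc(next_size(i * BATCH + j));
		for (j = 0; j < BATCH; j++)
			free(ptrs[j]);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);
	return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

/* All requests the same size: the case where prefetching can help. */
static size_t same_size(unsigned n)  { (void)n; return 64; }

/* Mixed sizes: prefetched chunks often go unused and get freed again. */
static size_t mixed_size(unsigned n) { return (size_t)16 << (n % 5); }

int main(void)
{
	printf("same size : %.3fs\n", bench(same_size));
	printf("mixed size: %.3fs\n", bench(mixed_size));
	return 0;
}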

JIRA: PURE-27597
---
 tpc/malloc2.13/arena.h  |  2 ++
 tpc/malloc2.13/malloc.c |  1 +
 tpc/malloc2.13/tcache.h | 70 +++++--------------------------------------------
 3 files changed, 9 insertions(+), 64 deletions(-)

diff --git a/tpc/malloc2.13/arena.h b/tpc/malloc2.13/arena.h
index f6b108f661dc..587bad51dd63 100644
--- a/tpc/malloc2.13/arena.h
+++ b/tpc/malloc2.13/arena.h
@@ -843,6 +843,7 @@ static struct malloc_state *arena_get(size_t size)
 		THREAD_STAT(++(arena->stat_lock_direct));
 	} else
 		arena = arena_get2(arena, size);
+	free_atomic_list(arena);
 	return arena;
 }
 
@@ -858,6 +859,7 @@ static inline void arena_lock(struct malloc_state *arena)
 #else
 	(void)mutex_lock(&arena->mutex);
 #endif
+	free_atomic_list(arena);
 }
 
 static inline void arena_unlock(struct malloc_state *arena)
diff --git a/tpc/malloc2.13/malloc.c b/tpc/malloc2.13/malloc.c
index c4e3fbada60a..439c1247fe99 100644
--- a/tpc/malloc2.13/malloc.c
+++ b/tpc/malloc2.13/malloc.c
@@ -2274,6 +2274,7 @@ static int perturb_byte;
 } while (0)
 
 
+static void free_atomic_list(struct malloc_state *arena);
 /* ------------------- Support for multiple arenas -------------------- */
 #include "arena.h"
 
diff --git a/tpc/malloc2.13/tcache.h b/tpc/malloc2.13/tcache.h
index 9e210a973d10..78d5d2f17462 100644
--- a/tpc/malloc2.13/tcache.h
+++ b/tpc/malloc2.13/tcache.h
@@ -20,24 +20,12 @@ static inline int fls(int x)
 /*
  * Per-thread cache is supposed to reduce lock contention on arenas.
  * Freed objects go to the cache first, allowing allocations to be
- * serviced from it without going to the arenas.  On cache misses we
- * have to take the arena lock, but can amortize the cost by
- * prefetching additional objects for future use.
- *
- * Prefetching is a heuristic.  If an object of size X is requested,
- * we assume more objects of the same size will be requested in the
- * near future.  If correct, this reduces locking overhead.  If
- * incorrect, we spend cpu cycles and pollute the tcache with unused
- * objects.  Sweet spot depends on the workload, but seems to be
- * around one.
+ * serviced from it without going to the arenas.
  */
 #define CACHE_SIZE_BITS		(17)
 #define CACHE_SIZE		(1 << CACHE_SIZE_BITS)
 #define MAX_CACHED_SIZE_BITS	(CACHE_SIZE_BITS - 3)
 #define MAX_CACHED_SIZE		(1 << MAX_CACHED_SIZE_BITS)
-#define MAX_PREFETCH_SIZE_BITS	(CACHE_SIZE_BITS - 6)
-#define MAX_PREFETCH_SIZE	(1 << MAX_PREFETCH_SIZE_BITS)
-#define NO_PREFETCH		(1)
 
 /*
  * Binning is done as a subdivided buddy allocator.  A buddy allocator
@@ -128,7 +116,8 @@ static inline int is_accessed(struct thread_cache *cache, int bin)
 /*
  * Free objects from the atomic_free_list while holding the
  * arena_lock.  In case the atomic_free_list has become obscenely big
- * we limit ourselves to freeing 64 objects at once.
+ * we limit ourselves to freeing 64 objects at once.  List sizes
+ * above 7000 elements have been observed in tests, so this matters.
  */
 static void free_atomic_list(struct malloc_state *arena)
 {
@@ -261,9 +250,9 @@ static void *tcache_malloc(size_t size)
 {
 	struct thread_cache *cache;
 	struct malloc_state *arena;
-	struct malloc_chunk **bin, *victim, *prefetch;
+	struct malloc_chunk **bin, *victim;
 	size_t nb;
-	int bin_no, i;
+	int bin_no;
 
 	checked_request2size(size, nb);
 	if (nb > MAX_CACHED_SIZE)
@@ -309,55 +298,8 @@ static void *tcache_malloc(size_t size)
 		mutex_unlock(&cache->mutex);
 		return p;
 	}
-
-	/*
-	 * GC the cache before prefetching, not after.  The last thing
-	 * we want is to spend effort prefetching, then release all
-	 * those objects via tcache_gc.  Also do it before taking the
-	 * lock, to minimize hold times.
-	 */
-	if (nb <= MAX_PREFETCH_SIZE && (cache->tc_size + nb * NO_PREFETCH) > CACHE_SIZE)
-		tcache_gc(cache);
-
-	arena = arena_get(size);
-	if (!arena) {
-		mutex_unlock(&cache->mutex);
-		return NULL;
-	}
-	free_atomic_list(arena);
-	/* TODO: _int_malloc does checked_request2size() again, which is silly */
-	victim = _int_malloc(arena, size);
-	if (!victim) {
-		arena = get_backup_arena(arena, size);
-		victim = _int_malloc(arena, size);
-	}
-	if (victim && nb <= MAX_PREFETCH_SIZE) {
-		/* Prefetch some more while we hold the lock */
-		for (i = 0; i < NO_PREFETCH; i++) {
-			prefetch = _int_malloc(arena, size);
-			if (!prefetch)
-				break;
-			prefetch = mem2chunk(prefetch);
-			if (cache_bin(chunksize(prefetch)) > bin_no) {
-				/*
-				 * If _int_malloc() returns bigger chunks,
-				 * we assume that prefetching won't buy us
-				 * any benefits.
-				 */
-				_int_free(arena, prefetch);
-				break;
-			}
-			assert(cache_bin(chunksize(prefetch)) == bin_no);
-			cache->tc_size += chunksize(prefetch);
-			cache->tc_count++;
-			add_to_bin(bin, prefetch);
-		}
-	}
-	arena_unlock(arena);
-	assert(!victim || arena == arena_for_chunk(mem2chunk(victim)));
-	alloc_perturb(victim, size);
 	mutex_unlock(&cache->mutex);
-	return victim;
+	return NULL;
 }
 
 static void tcache_free(mchunkptr p)
-- 
2.7.0.rc3
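
For context, the patch moves the draining of the arena's
atomic_free_list into arena_get() and arena_lock().  Below is a minimal
sketch of the lock-free push / bounded drain pattern that the tcache.h
comment describes, assuming a simple Treiber stack of remotely freed
chunks; the structures and names are illustrative, not the actual
tpc/malloc2.13 code.

/* atomic_free_list.c - illustrative sketch, not the real allocator. */
#include <stdatomic.h>
#include <stddef.h>

/* Stand-ins for the real malloc_chunk / malloc_state structures. */
struct chunk {
	struct chunk *next;
};

struct arena {
	_Atomic(struct chunk *) atomic_free_list;
};

/* Remote-free path: a thread that does not hold the arena lock pushes
 * the chunk onto a lock-free stack instead of taking the lock. */
static void remote_free(struct arena *arena, struct chunk *p)
{
	struct chunk *old = atomic_load(&arena->atomic_free_list);

	do {
		p->next = old;
	} while (!atomic_compare_exchange_weak(&arena->atomic_free_list,
					       &old, p));
}

/* Called with the arena lock held, so there is only one consumer.
 * Frees at most 64 chunks per call; anything beyond that stays on the
 * list for the next caller. */
static void free_atomic_list(struct arena *arena)
{
	struct chunk *head;
	int budget;

	for (budget = 64; budget; budget--) {
		head = atomic_load(&arena->atomic_free_list);
		do {
			if (!head)
				return;
		} while (!atomic_compare_exchange_weak(&arena->atomic_free_list,
						       &head, head->next));
		/* The real code hands the chunk to _int_free() here. */
	}
}

Capping the drain keeps the arena lock hold time bounded even when the
backlog has grown into the thousands of entries mentioned in the
comment.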

