This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC/PoC] malloc: use wfcqueue to speed up remote frees


The goal is to reduce contention and improve locality of cross-thread
malloc/free traffic common to IPC systems (including Userspace-RCU) and
some garbage-collected runtimes.

Very rough benchmarks using `xthr`[1], a small URCU test program
I wrote years ago, show huge improvements in time and space:

  $ /usr/bin/time ./before.sh ./xthr -a 2 -m 2500 -i $((1024 * 1024 * 5))
  2.46user 3.51system 0:05.50elapsed 108%CPU (0avgtext+0avgdata 3352592maxresident)k
  0inputs+0outputs (17major+838014minor)pagefaults 0swaps

  $ /usr/bin/time ./after.sh ./xthr -a 2 -m 2500 -i $((1024 * 1024 * 5))
  2.68user 0.48system 0:02.55elapsed 123%CPU (0avgtext+0avgdata 532904maxresident)k
  0inputs+0outputs (0major+174304minor)pagefaults 0swaps

Where before.sh and after.sh are script wrappers around ld-linux for
the appropriate glibc installation.

  #!/bin/sh
  exec /tmp/6/lib/ld-linux-x86-64.so.2 --library-path /tmp/6/lib "$@"

  [1] xthr.c: https://80x24.org/spew/20180731082205.vykyunsm5xg7ml3e@dcvr/

It avoids lock contention by deferring `_int_free' so that it only
runs in contexts where the arena lock is already acquired.  Three
functions are added:

* remote_free_begin  - Producer - enqueues the allocation into an arena
                       designated to another thread.  This is wait-free,
                       branchless, and only modifies the last (cold)
                       cacheline of the arena belonging to another thread

* remote_free_step   - Consumer - grabs everything enqueued by
                       remote_free_begin and calls `_int_free' locally
                       without acquiring extra locks.  Returns `true'
                       if it did any work, as other threads may have
                       called `remote_free_begin' in the meantime.

* remote_free_finish - Consumer - calls remote_free_step in a loop until
                       there is no more work to do.  It runs before most
                       calls to malloc_consolidate.

wfcqueue is the LGPL-2.1+ Wait-Free Concurrent Queue distributed
with Userspace-RCU <http://liburcu.org/>.  wfcqueue does not
depend on RCU itself (only atomics), but forms the basis of the
workqueue and call_rcu primitive within liburcu.

The functions I'm using from wfcqueue can be statically-linked
from header files, so it involves no extra linkage at runtime.
Note: Debian users can `apt-get install liburcu-dev' to get
wfcqueue.h; I expect it's available for other distros.

If this proof-of-concept is found acceptable, I can work on
making wfcqueue use the atomics provided by gcc/glibc instead
of the `uatomic` headers of URCU so it can be bundled with
glibc.  But maybe adding liburcu as a build-time dependency
is acceptable :)

Note: I haven't gotten "make -j4 check" even close to passing even
without this patch on commit 98864ed0e055583707e37cdb7d41a9cdeac4473b.
It's likely a problem on my end; I'm on a fairly common Debian 9
x86-64 system, though I haven't built glibc in years.

On the other hand, with the exception of fiddle (dl-dependent)
and tz tests, most of the "test-all" suite passes for Ruby
when using the either the before.sh or after.sh glibc wrapper
(but I haven't done much testing otherwise):

    make test-all TESTS='-x fiddle -x time_tz' \
		RUNRUBYOPT=--precommand=/path/to/after.sh
---
 malloc/malloc.c | 108 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 103 insertions(+), 5 deletions(-)

diff --git a/malloc/malloc.c b/malloc/malloc.c
index e247c77b7d..40d61e45db 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -247,6 +247,9 @@
 /* For SINGLE_THREAD_P.  */
 #include <sysdep-cancel.h>
 
+#define _LGPL_SOURCE /* allows inlines */
+#include <urcu/wfcqueue.h>
+
 /*
   Debugging:
 
@@ -1660,6 +1663,9 @@ struct malloc_state
   /* Serialize access.  */
   __libc_lock_define (, mutex);
 
+  /* Only the owner of this arena writes to the head */
+  struct __cds_wfcq_head remote_free_head;
+
   /* Flags (formerly in max_fast).  */
   int flags;
 
@@ -1697,6 +1703,11 @@ struct malloc_state
   /* Memory allocated from the system in this arena.  */
   INTERNAL_SIZE_T system_mem;
   INTERNAL_SIZE_T max_system_mem;
+
+  /* remote_free_tail is written to by a thread other than the owner of
+     this arena, so we want this on a different cacheline than
+     remote_free_head */
+  struct cds_wfcq_tail remote_free_tail;
 };
 
 struct malloc_par
@@ -1794,6 +1805,7 @@ malloc_init_state (mstate av)
   int i;
   mbinptr bin;
 
+  __cds_wfcq_init (&av->remote_free_head, &av->remote_free_tail);
   /* Establish circular links for normal bins */
   for (i = 1; i < NBINS; ++i)
     {
@@ -3007,6 +3019,67 @@ tcache_thread_shutdown (void)
 
 #endif /* !USE_TCACHE  */
 
+_Static_assert (MINSIZE >= sizeof(struct cds_wfcq_node),
+                "struct cds_wfcq_node too big");
+/* wait-free enqueue to the remote arena */
+static void
+remote_free_begin (mstate av, void *mem)
+{
+  struct cds_wfcq_node *node = mem;
+
+  cds_wfcq_node_init (node);
+  cds_wfcq_enqueue (&av->remote_free_head, &av->remote_free_tail, node);
+  /* other thread calls remote_free_step */
+}
+
+/*
+ * process remote free queue, must have locked av
+ * returns true if it did anything
+ */
+static bool
+remote_free_step (mstate av)
+{
+  struct cds_wfcq_node *node, *n;
+  struct __cds_wfcq_head tmp_head;
+  struct cds_wfcq_tail tmp_tail;
+  enum cds_wfcq_ret ret;
+
+  if (__glibc_unlikely (av == NULL))
+    {
+      return false;
+    }
+
+  __cds_wfcq_init (&tmp_head, &tmp_tail);
+  ret = __cds_wfcq_splice_nonblocking (&tmp_head, &tmp_tail,
+				       &av->remote_free_head,
+				       &av->remote_free_tail);
+
+  if (__glibc_unlikely (ret == CDS_WFCQ_RET_DEST_EMPTY))
+    {
+      MAYBE_INIT_TCACHE ();
+      __cds_wfcq_for_each_blocking_safe (&tmp_head, &tmp_tail, node, n)
+        {
+          _int_free (av, mem2chunk(node), 1);
+        }
+
+      /*
+       * tell caller we did some work; it's possible other threads
+       * enqueued more work for us while we were busy
+       */
+      return true;
+    }
+
+  assert (ret != CDS_WFCQ_RET_DEST_NON_EMPTY);
+
+  return false; /* did nothing */
+}
+
+static void
+remote_free_finish (mstate av)
+{
+  while (remote_free_step (av)) ;
+}
+
 void *
 __libc_malloc (size_t bytes)
 {
@@ -3045,6 +3118,7 @@ __libc_malloc (size_t bytes)
     }
 
   arena_get (ar_ptr, bytes);
+  remote_free_step (ar_ptr);
 
   victim = _int_malloc (ar_ptr, bytes);
   /* Retry with another arena only if we were able to find a usable arena
@@ -3053,6 +3127,7 @@ __libc_malloc (size_t bytes)
     {
       LIBC_PROBE (memory_malloc_retry, 1, bytes);
       ar_ptr = arena_get_retry (ar_ptr, bytes);
+      remote_free_step (ar_ptr);
       victim = _int_malloc (ar_ptr, bytes);
     }
 
@@ -3102,10 +3177,16 @@ __libc_free (void *mem)
       return;
     }
 
-  MAYBE_INIT_TCACHE ();
-
   ar_ptr = arena_for_chunk (p);
-  _int_free (ar_ptr, p, 0);
+  if (thread_arena == ar_ptr)	/* thread_arena may be NULL */
+    {
+      MAYBE_INIT_TCACHE (); /* XXX is this needed if thread_arena == ar_ptr? */
+      _int_free (ar_ptr, p, 0);
+    }
+  else
+    {
+      remote_free_begin (ar_ptr, mem);
+    }
 }
 libc_hidden_def (__libc_free)
 
@@ -3211,6 +3292,8 @@ __libc_realloc (void *oldmem, size_t bytes)
 
   __libc_lock_lock (ar_ptr->mutex);
 
+  remote_free_step (ar_ptr);
+
   newp = _int_realloc (ar_ptr, oldp, oldsize, nb);
 
   __libc_lock_unlock (ar_ptr->mutex);
@@ -3225,7 +3308,14 @@ __libc_realloc (void *oldmem, size_t bytes)
       if (newp != NULL)
         {
           memcpy (newp, oldmem, oldsize - SIZE_SZ);
-          _int_free (ar_ptr, oldp, 0);
+          if (thread_arena == ar_ptr)
+            {
+              _int_free (ar_ptr, oldp, 0);
+            }
+          else  /* don't lock again */
+            {
+              remote_free_begin (ar_ptr, oldmem);
+            }
         }
     }
 
@@ -3294,12 +3384,14 @@ _mid_memalign (size_t alignment, size_t bytes, void *address)
     }
 
   arena_get (ar_ptr, bytes + alignment + MINSIZE);
+  remote_free_step(ar_ptr);
 
   p = _int_memalign (ar_ptr, alignment, bytes);
   if (!p && ar_ptr != NULL)
     {
       LIBC_PROBE (memory_memalign_retry, 2, bytes, alignment);
       ar_ptr = arena_get_retry (ar_ptr, bytes);
+      remote_free_step(ar_ptr);
       p = _int_memalign (ar_ptr, alignment, bytes);
     }
 
@@ -3388,7 +3480,10 @@ __libc_calloc (size_t n, size_t elem_size)
   if (SINGLE_THREAD_P)
     av = &main_arena;
   else
-    arena_get (av, sz);
+    {
+      arena_get (av, sz);
+      remote_free_step (av);
+    }
 
   if (av)
     {
@@ -3428,6 +3523,7 @@ __libc_calloc (size_t n, size_t elem_size)
 	{
 	  LIBC_PROBE (memory_calloc_retry, 1, sz);
 	  av = arena_get_retry (av, sz);
+	  remote_free_step(av);
 	  mem = _int_malloc (av, sz);
 	}
 
@@ -4750,6 +4846,7 @@ static int
 mtrim (mstate av, size_t pad)
 {
   /* Ensure all blocks are consolidated.  */
+  remote_free_finish (av);
   malloc_consolidate (av);
 
   const size_t ps = GLRO (dl_pagesize);
@@ -5133,6 +5230,7 @@ __libc_mallopt (int param_number, int value)
 
   /* We must consolidate main arena before changing max_fast
      (see definition of set_max_fast).  */
+  remote_free_finish (av);
   malloc_consolidate (av);
 
   switch (param_number)
-- 
EW


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]