[PATCH] libdw: add thread-safety to dwarf_getabbrev()

Jonathon Anderson jma14@rice.edu
Sun Aug 25 01:11:00 GMT 2019



On Sat, Aug 24, 2019 at 6:24 PM, Mark Wielaard <mark@klomp.org> wrote:
> Hi,
> 
> On Fri, Aug 16, 2019 at 02:24:28PM -0500, Jonathon Anderson wrote:
>>  2. Adding a lock & array structure to the memory manager (pseudo-TLS)
>>     (libdwP.h, libdw_alloc.c)
> 
> Specifically concentrating on this part.
> 
>>  diff --git a/libdw/ChangeLog b/libdw/ChangeLog
>>  index bf1f4857..87abf7a7 100644
>>  --- a/libdw/ChangeLog
>>  +++ b/libdw/ChangeLog
>>  @@ -1,3 +1,12 @@
>>  +2019-08-15  Jonathon Anderson  <jma14@rice.edu>
>>  +
>>  +	* libdw_alloc.c (__libdw_allocate): Added thread-safe stack allocator.
>>  +	* libdwP.h (Dwarf): Likewise.
>>  +	* dwarf_begin_elf.c (dwarf_begin_elf): Support for above.
>>  +	* dwarf_end.c (dwarf_end): Likewise.
>>  [...]
>>  +	* Makefile.am: Link -lpthread to provide rwlocks.
> 
>>  diff --git a/libdw/Makefile.am b/libdw/Makefile.am
>>  index 7a3d5322..6d0a0187 100644
>>  --- a/libdw/Makefile.am
>>  +++ b/libdw/Makefile.am
>>  @@ -108,7 +108,7 @@ am_libdw_pic_a_OBJECTS = $(libdw_a_SOURCES:.c=.os)
>>  libdw_so_LIBS = libdw_pic.a ../libdwelf/libdwelf_pic.a \
>>  	  ../libdwfl/libdwfl_pic.a ../libebl/libebl.a
>>  libdw_so_DEPS = ../lib/libeu.a ../libelf/libelf.so
>>  -libdw_so_LDLIBS = $(libdw_so_DEPS) -ldl -lz $(argp_LDADD) $(zip_LIBS)
>>  +libdw_so_LDLIBS = $(libdw_so_DEPS) -ldl -lz $(argp_LDADD) $(zip_LIBS) -lpthread
>>  libdw_so_SOURCES =
>>  libdw.so$(EXEEXT): $(srcdir)/libdw.map $(libdw_so_LIBS) $(libdw_so_DEPS)
>>  # The rpath is necessary for libebl because its $ORIGIN use will
> 
> Do we also need -pthread for CFLAGS?

Yep, that would be the smart thing to do. Will be handled in the next 
version of the patch (once I figure out how to autotools...).

> 
>>  diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
>>  index 38c8f5c6..b3885bb5 100644
>>  --- a/libdw/dwarf_begin_elf.c
>>  +++ b/libdw/dwarf_begin_elf.c
>>  @@ -417,11 +417,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd, Elf_Scn *scngrp)
>>    /* Initialize the memory handling.  */
>>    result->mem_default_size = mem_default_size;
>>    result->oom_handler = __libdw_oom;
>>  -  result->mem_tail = (struct libdw_memblock *) (result + 1);
>>  -  result->mem_tail->size = (result->mem_default_size
>>  -			    - offsetof (struct libdw_memblock, mem));
>>  -  result->mem_tail->remaining = result->mem_tail->size;
>>  -  result->mem_tail->prev = NULL;
>>  +  pthread_rwlock_init(&result->mem_rwl, NULL);
>>  +  result->mem_stacks = 1;
>>  +  result->mem_tails = malloc (sizeof (struct libdw_memblock *));
>>  +  result->mem_tails[0] = (struct libdw_memblock *) (result + 1);
>>  +  result->mem_tails[0]->size = (result->mem_default_size
>>  +			       - offsetof (struct libdw_memblock, mem));
>>  +  result->mem_tails[0]->remaining = result->mem_tails[0]->size;
>>  +  result->mem_tails[0]->prev = NULL;
> 
> Can't we just set mem_tails[0] = NULL?
> Wouldn't __libdw_alloc_tail () then initialize it?

Mostly an artifact of preserving the previous behavior for 
single-threaded programs. With that change the malloc for the Dwarf 
simplifies too (and opened but otherwise unused Dwarfs use less 
memory). The downside is an extra malloc on first use, which shouldn't 
be an issue for any real program.
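
For reference, the simplified initialization would look roughly like 
this (untested sketch; __libdw_alloc_tail would then lazily allocate 
the first block on first use):

    /* Initialize the memory handling.  Sketch only: the first stack is
       left empty and filled in by __libdw_alloc_tail when needed.  */
    result->mem_default_size = mem_default_size;
    result->oom_handler = __libdw_oom;
    pthread_rwlock_init (&result->mem_rwl, NULL);
    result->mem_stacks = 1;
    result->mem_tails = malloc (sizeof (struct libdw_memblock *));
    result->mem_tails[0] = NULL;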

> 
>>  diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
>>  index 29795c10..6317bcda 100644
>>  --- a/libdw/dwarf_end.c
>>  +++ b/libdw/dwarf_end.c
>>  @@ -94,14 +94,22 @@ dwarf_end (Dwarf *dwarf)
>>        /* And the split Dwarf.  */
>>        tdestroy (dwarf->split_tree, noop_free);
>> 
>>  -      struct libdw_memblock *memp = dwarf->mem_tail;
>>  -      /* The first block is allocated together with the Dwarf object.  */
>>  -      while (memp->prev != NULL)
>>  -	{
>>  -	  struct libdw_memblock *prevp = memp->prev;
>>  -	  free (memp);
>>  -	  memp = prevp;
>>  -	}
>>  +      for (size_t i = 0; i < dwarf->mem_stacks; i++)
>>  +        {
>>  +          struct libdw_memblock *memp = dwarf->mem_tails[i];
>>  +          /* The first block is allocated together with the Dwarf
>>  +             object.  */
> 
> Then this wouldn't be true again.
> 
>>  +          while (memp != NULL && memp->prev != NULL)
>>  +	    {
>>  +	      struct libdw_memblock *prevp = memp->prev;
>>  +	      free (memp);
>>  +	      memp = prevp;
>>  +	    }
>>  +          /* Only for stack 0 though, the others are allocated
>>  +             individually.  */
>>  +          if (memp != NULL && i > 0)
>>  +            free (memp);
>>  +        }
> 
> And then you don't need this special case.
> 
>>  +      free (dwarf->mem_tails);
>>  +      pthread_rwlock_destroy (&dwarf->mem_rwl);
>> 
>>        /* Free the pubnames helper structure.  */
>>        free (dwarf->pubnames_sets);
>>  diff --git a/libdw/libdwP.h b/libdw/libdwP.h
>>  index eebb7d12..442d493d 100644
>>  --- a/libdw/libdwP.h
>>  +++ b/libdw/libdwP.h
>>  @@ -31,6 +31,7 @@
>> 
>>  #include <libintl.h>
>>  #include <stdbool.h>
>>  +#include <pthread.h>
>> 
>>  #include <libdw.h>
>>  #include <dwarf.h>
>>  @@ -218,16 +219,18 @@ struct Dwarf
>>    /* Similar for addrx/constx, which will come from .debug_addr section.  */
>>    struct Dwarf_CU *fake_addr_cu;
>> 
>>  -  /* Internal memory handling.  This is basically a simplified
>>  +  /* Internal memory handling.  This is basically a simplified thread-local
>>       reimplementation of obstacks.  Unfortunately the standard obstack
>>       implementation is not usable in libraries.  */
>>  +  pthread_rwlock_t mem_rwl;
>>  +  size_t mem_stacks;
>>    struct libdw_memblock
>>    {
>>      size_t size;
>>      size_t remaining;
>>      struct libdw_memblock *prev;
>>      char mem[0];
>>  -  } *mem_tail;
>>  +  } **mem_tails;
> 
> Please document what/how exactly mem_rwl protects which other fields.

Got it.
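
Something along these lines, perhaps (rough sketch, exact wording to be 
settled in the next revision):

    /* Internal memory handling.  Each thread allocates from its own
       stack of memblocks, selected by a per-thread id.  mem_rwl
       protects mem_tails and mem_stacks: the write lock is held only
       while the array is resized, the read lock otherwise, so a thread
       may freely use its own mem_tails[thread_id] entry.  */
    pthread_rwlock_t mem_rwl;
    size_t mem_stacks;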

> 
>>    /* Default size of allocated memory blocks.  */
>>    size_t mem_default_size;
>>  @@ -572,7 +575,7 @@ extern void __libdw_seterrno (int value) internal_function;
>> 
>>  /* Memory handling, the easy parts.  This macro does not do any locking.  */
> 
> This comment isn't true anymore. __libdw_alloc_tail does locking.

Got it.

> Also I think the locking should be extended beyond just that
> function, see below.
> 
>>  #define libdw_alloc(dbg, type, tsize, cnt) \
>>  -  ({ struct libdw_memblock *_tail = (dbg)->mem_tail;			      \
>>  +  ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);	      \
>>       size_t _required = (tsize) * (cnt);				      \
>>       type *_result = (type *) (_tail->mem + (_tail->size - _tail->remaining));\
>>       size_t _padding = ((__alignof (type)				      \
>>  @@ -591,6 +594,10 @@ extern void __libdw_seterrno (int value) internal_function;
>>  #define libdw_typed_alloc(dbg, type) \
>>    libdw_alloc (dbg, type, sizeof (type), 1)
>> 
>>  +/* Callback to choose a thread-local memory allocation stack.  */
>>  +extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
>>  +     __nonnull_attribute__ (1);
>>  +
>>  /* Callback to allocate more.  */
>>  extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>>       __attribute__ ((__malloc__)) __nonnull_attribute__ (1);
>>  diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
>>  index f1e08714..c3c5e8a7 100644
>>  --- a/libdw/libdw_alloc.c
>>  +++ b/libdw/libdw_alloc.c
>>  @@ -33,9 +33,73 @@
>> 
>>  #include <errno.h>
>>  #include <stdlib.h>
>>  +#include <assert.h>
>>  #include "libdwP.h"
>>  #include "system.h"
>>  +#include "stdatomic.h"
>>  +#if USE_VG_ANNOTATIONS == 1
>>  +#include <helgrind.h>
>>  +#include <drd.h>
>>  +#else
>>  +#define ANNOTATE_HAPPENS_BEFORE(X)
>>  +#define ANNOTATE_HAPPENS_AFTER(X)
>>  +#endif
>>  +
>>  +
>>  +#define thread_local __thread
>> 
>>  +static thread_local int initialized = 0;
>>  +static thread_local size_t thread_id;
>>  +static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
> 
> Could you initialize thread_id to (size_t) -1?
> Then you don't need another thread_local to indicate not initialized?

Just a habit, easy enough to change for the next version.
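
So the top of the file would become something like (sketch of the next 
version, rest of the function as in the patch):

    static thread_local size_t thread_id = (size_t) -1;
    static atomic_size_t next_id = ATOMIC_VAR_INIT(0);

    struct libdw_memblock *
    __libdw_alloc_tail (Dwarf *dbg)
    {
      /* Lazily assign this thread an id the first time it allocates.  */
      if (thread_id == (size_t) -1)
        thread_id = atomic_fetch_add (&next_id, 1);

      /* ... the rest of the function stays as in the patch ... */
    }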

> 
>>  +struct libdw_memblock *
>>  +__libdw_alloc_tail (Dwarf *dbg)
>>  +{
>>  +  if (!initialized)
> 
> And change this to thread_id == (size_t) -1?
> 
>>  +    {
>>  +      thread_id = atomic_fetch_add (&next_id, 1);
>>  +      initialized = 1;
>>  +    }
>>  +
>>  +  pthread_rwlock_rdlock (&dbg->mem_rwl);
>>  +  if (thread_id >= dbg->mem_stacks)
>>  +    {
>>  +      pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +      pthread_rwlock_wrlock (&dbg->mem_rwl);
>>  +
>>  +      /* Another thread may have already reallocated. In theory using an
>>  +         atomic would be faster, but given that this only happens once per
>>  +         thread per Dwarf, some minor slowdown should be fine.  */
>>  +      if (thread_id >= dbg->mem_stacks)
>>  +        {
>>  +          dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
>>  +                                    * sizeof (struct libdw_memblock *));
>>  +          assert(dbg->mem_tails);
> 
> Don't use assert here, use if (dbg->mem_tails == NULL) 
> dbg->oom_handler ();

Same as __libdw_allocate, got it. Apologies for not noticing sooner.

> 
> But what if realloc moves the block?
> Then all dbg->mem_tails[thread_id] pointers become invalid.
> And this function drops the lock before returning a libdw_memblock*.

Not quite, dbg->mem_tails is an array of pointers (note the ** in its 
definition, and the use of the plural "tails"). So while the 
dbg->mem_tails pointer becomes invalid, the dbg->mem_tails[thread_id] 
pointers don't.

It would be a different story if dbg->mem_tails was an array of 
libdw_memblocks, but its not. That would increase the "memory leak" 
issue mentioned previously (to ~4K per dead thread) and require more 
work on the part of the reallocing thread to initialize the new entries 
(which at the moment should reduce to a memset, assuming the compiler 
is smart enough and NULL == 0).
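
A self-contained toy example of the distinction (not libdw code, just 
to illustrate why the memblock pointers survive the realloc):

    #include <assert.h>
    #include <stdlib.h>

    int main (void)
    {
      int *block = malloc (sizeof (int));      /* stands in for a memblock */
      int **tails = malloc (sizeof (int *));   /* stands in for mem_tails */
      tails[0] = block;
      tails = realloc (tails, 2 * sizeof (int *));  /* the array may move... */
      assert (tails[0] == block);  /* ...but the block it points to does not */
      free (block);
      free (tails);
      return 0;
    }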

> 
> So I think the lock needs to extend beyond this function somehow.  Or
> we need to prevent another thread reallocing while we are dealing with
> the assigned memblock.

No need, in fact we want to drop the lock as soon as possible to let 
new threads in for realloc's. Under the assumption that we don't need 
to allocate new blocks (extremely) often, the extra cost to relock when 
we do should be relatively small. Also see __libdw_allocate.

> 
>>  +          for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
>>  +            dbg->mem_tails[i] = NULL;
>>  +          dbg->mem_stacks = thread_id + 1;
>>  +          ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
>>  +        }
>>  +
>>  +      pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +      pthread_rwlock_rdlock (&dbg->mem_rwl);
>>  +    }
>>  +
>>  +  /* At this point, we have an entry in the tail array.  */
>>  +  ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
>>  +  struct libdw_memblock *result = dbg->mem_tails[thread_id];
>>  +  if (result == NULL)
>>  +    {
>>  +      result = malloc (dbg->mem_default_size);
>>  +      result->size = dbg->mem_default_size
>>  +                     - offsetof (struct libdw_memblock, mem);
>>  +      result->remaining = result->size;
>>  +      result->prev = NULL;
>>  +      dbg->mem_tails[thread_id] = result;
>>  +    }
>>  +  pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +  return result;
> 
> Here the lock is dropped and result points into dbg->mem_tails
> which means it cannot be used because the realloc above might
> make it invalid.

result points to a (potentially newly created) memory block, accessible 
only by this thread and never realloc'd.

>>  +}
>> 
>>  void *
>>  __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>>  @@ -52,8 +116,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>>    newp->size = size - offsetof (struct libdw_memblock, mem);
>>    newp->remaining = (uintptr_t) newp + size - (result + minsize);
>> 
>>  -  newp->prev = dbg->mem_tail;
>>  -  dbg->mem_tail = newp;
>>  +  pthread_rwlock_rdlock (&dbg->mem_rwl);
>>  +  newp->prev = dbg->mem_tails[thread_id];
>>  +  dbg->mem_tails[thread_id] = newp;
>>  +  pthread_rwlock_unlock (&dbg->mem_rwl);
>> 

Here we tinker with dbg->mem_tails under a read lock, which is mutually 
exclusive with the write lock held during the realloc. The new block 
belongs to this thread alone, so no lock is needed to manage its usage.
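
In other words, the intended protocol is roughly (just restating what 
the patch does):

    /* Per-thread fast path: the read lock only guards against the
       mem_tails array being resized underneath us.  */
    pthread_rwlock_rdlock (&dbg->mem_rwl);
    /* ... read or update dbg->mem_tails[thread_id] ... */
    pthread_rwlock_unlock (&dbg->mem_rwl);

    /* Grow path, taken at most once per thread per Dwarf: the write
       lock excludes all readers while mem_tails is realloc'd and
       mem_stacks is bumped.  */
    pthread_rwlock_wrlock (&dbg->mem_rwl);
    /* ... realloc dbg->mem_tails, update dbg->mem_stacks ... */
    pthread_rwlock_unlock (&dbg->mem_rwl);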

>> 
>>    return (void *) result;
>>  }
> 
> Cheers,
> 
> Mark

As mentioned in other mails, this management scheme isn't (always) 
optimally memory efficient, but its speed is good under parallel 
stress. Far better than wrapping the whole thing with a single 
pthread_mutex_t.

-Jonathon


