[PATCH 2/2] libdw: Rewrite the memory handler to be more robust.

Jonathon Anderson jma14@rice.edu
Thu Nov 7 18:13:00 GMT 2019



On Thu, Nov 7, 2019 at 18:20, Mark Wielaard <mark@klomp.org> wrote:
> Hi Jonathon,
> 
> On Tue, 2019-10-29 at 22:14 +0100, Mark Wielaard wrote:
>>  Pthread's thread-local variables are a highly limited resource, which
>>  makes it difficult to use many Dwarfs concurrently. This replaces that
>>  approach with a less efficient (and less elegant) but more robust method.
> 
> Thanks, it looks good (similar to a variant I already reviewed
> earlier). Some small comments below.
> 
> I haven't benchmarked this version. Did you do any?
> 
>>  diff --git a/lib/atomics.h b/lib/atomics.h
>>  index ffd12f87..e3773759 100644
>>  --- a/lib/atomics.h
>>  +++ b/lib/atomics.h
>>  @@ -31,7 +31,9 @@
>>   #if HAVE_STDATOMIC_H
>>   /* If possible, use the compiler's preferred atomics.  */
>>   # include <stdatomic.h>
>>  +# include <threads.h>
>>   #else
>>   /* Otherwise, try to use the builtins provided by this compiler.  */
>>   # include "stdatomic-fbsd.h"
>>  +# define thread_local __thread
>>   #endif /* HAVE_STDATOMIC_H */
> 
> Do we really need this?
> We already use __thread unconditionally in the rest of the code.
> The usage of threads.h seems to imply we actually want C11
> _Thread_local. Is that what you really want, or can we just use
> __thread in libdw_alloc.c for thread_id?

We don't really need it; I just got into the habit of writing 
thread_local (and it gives proper C11 compatibility). __thread is 
perfectly fine for thread_id.
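
For reference, a minimal sketch of the three spellings (the variable
name below is made up for illustration):

  /* Sketch: equivalent ways to declare thread-local storage in C.
     Each thread gets its own copy; the initializer must be constant.  */
  #include <threads.h>                  /* C11: thread_local macro.  */

  static thread_local size_t tls_example = 0;  /* C11 spelling.  */
  /* static _Thread_local size_t tls_example = 0;  the raw C11 keyword.  */
  /* static __thread size_t tls_example = 0;  GCC/Clang extension, no
     header needed.  */

On GCC and Clang all three produce the same TLS access, which is why
the fallback #define in the patch works.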

> 
>>  diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
>>  index 8d137414..eadff13b 100644
>>  --- a/libdw/dwarf_begin_elf.c
>>  +++ b/libdw/dwarf_begin_elf.c
>>  @@ -418,13 +418,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd, Elf_Scn *scngrp)
>>        actual allocation.  */
>>     result->mem_default_size = mem_default_size;
>>     result->oom_handler = __libdw_oom;
>>  -  if (pthread_key_create (&result->mem_key, NULL) != 0)
>>  +  if (pthread_rwlock_init(&result->mem_rwl, NULL) != 0)
>>       {
>>         free (result);
>>  -      __libdw_seterrno (DWARF_E_NOMEM); /* no memory or max pthread keys.  */
>>  +      __libdw_seterrno (DWARF_E_NOMEM); /* no memory.  */
>>         return NULL;
>>       }
>>  -  atomic_init (&result->mem_tail, (uintptr_t)NULL);
>>  +  result->mem_stacks = 0;
>>  +  result->mem_tails = NULL;
>> 
>>     if (cmd == DWARF_C_READ || cmd == DWARF_C_RDWR)
>>       {
> 
> OK.
> 
>>  diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
>>  index a2e94436..3fd2836d 100644
>>  --- a/libdw/dwarf_end.c
>>  +++ b/libdw/dwarf_end.c
>>  @@ -95,17 +95,19 @@ dwarf_end (Dwarf *dwarf)
>>         tdestroy (dwarf->split_tree, noop_free);
>> 
>>         /* Free the internally allocated memory.  */
>>  -      struct libdw_memblock *memp;
>>  -      memp = (struct libdw_memblock *) (atomic_load_explicit
>>  -					(&dwarf->mem_tail,
>>  -					 memory_order_relaxed));
>>  -      while (memp != NULL)
>>  -	{
>>  -	  struct libdw_memblock *prevp = memp->prev;
>>  -	  free (memp);
>>  -	  memp = prevp;
>>  -	}
>>  -      pthread_key_delete (dwarf->mem_key);
>>  +      for (size_t i = 0; i < dwarf->mem_stacks; i++)
>>  +        {
>>  +          struct libdw_memblock *memp = dwarf->mem_tails[i];
>>  +          while (memp != NULL)
>>  +	    {
>>  +	      struct libdw_memblock *prevp = memp->prev;
>>  +	      free (memp);
>>  +	      memp = prevp;
>>  +	    }
>>  +        }
>>  +      if (dwarf->mem_tails != NULL)
>>  +        free (dwarf->mem_tails);
>>  +      pthread_rwlock_destroy (&dwarf->mem_rwl);
>> 
>>         /* Free the pubnames helper structure.  */
>>         free (dwarf->pubnames_sets);
> 
> OK. Might it be an idea to have some call to reset next_id (see 
> below)?

Generally not a good idea (see below).

> 
>>  diff --git a/libdw/libdwP.h b/libdw/libdwP.h
>>  index ad2599eb..3e1ef59b 100644
>>  --- a/libdw/libdwP.h
>>  +++ b/libdw/libdwP.h
>>  @@ -149,17 +149,6 @@ enum
>> 
>>   #include "dwarf_sig8_hash.h"
>> 
>>  -/* Structure for internal memory handling.  This is basically a simplified
>>  -   reimplementation of obstacks.  Unfortunately the standard obstack
>>  -   implementation is not usable in libraries.  */
>>  -struct libdw_memblock
>>  -{
>>  -  size_t size;
>>  -  size_t remaining;
>>  -  struct libdw_memblock *prev;
>>  -  char mem[0];
>>  -};
>>  -
>>   /* This is the structure representing the debugging state.  */
>>   struct Dwarf
>>   {
>>  @@ -231,11 +220,22 @@ struct Dwarf
>>     /* Similar for addrx/constx, which will come from .debug_addr section.  */
>>     struct Dwarf_CU *fake_addr_cu;
>> 
>>  -  /* Internal memory handling.  Each thread allocates separately and only
>>  -     allocates from its own blocks, while all the blocks are pushed atomically
>>  -     onto a unified stack for easy deallocation.  */
>>  -  pthread_key_t mem_key;
>>  -  atomic_uintptr_t mem_tail;
>>  +  /* Supporting lock for internal memory handling.  Ensures threads that have
>>  +     an entry in the mem_tails array are not disturbed by new threads doing
>>  +     allocations for this Dwarf.  */
>>  +  pthread_rwlock_t mem_rwl;
>>  +
>>  +  /* Internal memory handling.  This is basically a simplified thread-local
>>  +     reimplementation of obstacks.  Unfortunately the standard obstack
>>  +     implementation is not usable in libraries.  */
>>  +  size_t mem_stacks;
>>  +  struct libdw_memblock
>>  +  {
>>  +    size_t size;
>>  +    size_t remaining;
>>  +    struct libdw_memblock *prev;
>>  +    char mem[0];
>>  +  } **mem_tails;
>> 
>>     /* Default size of allocated memory blocks.  */
>>     size_t mem_default_size;
> 
> OK.
> 
>>  @@ -578,34 +578,31 @@ libdw_valid_user_form (int form)
>>   extern void __libdw_seterrno (int value) internal_function;
>> 
>> 
>>  -/* Memory handling, the easy parts.  This macro does not do nor need to do any
>>  -   locking for proper concurrent operation.  */
>>  +/* Memory handling, the easy parts.  */
>>   #define libdw_alloc(dbg, type, tsize, cnt) \
>>  -  ({ struct libdw_memblock *_tail = pthread_getspecific (dbg->mem_key);       \
>>  -     size_t _req = (tsize) * (cnt);					      \
>>  -     type *_result;							      \
>>  -     if (unlikely (_tail == NULL))					      \
>>  -       _result = (type *) __libdw_allocate (dbg, _req, __alignof (type));     \
>>  +  ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);		      \
>>  +     size_t _required = (tsize) * (cnt);				      \
>>  +     type *_result = (type *) (_tail->mem + (_tail->size - _tail->remaining));\
>>  +     size_t _padding = ((__alignof (type)				      \
>>  +			 - ((uintptr_t) _result & (__alignof (type) - 1)))    \
>>  +			& (__alignof (type) - 1));			      \
>>  +     if (unlikely (_tail->remaining < _required + _padding))		      \
>>  +       _result = (type *) __libdw_allocate (dbg, _required, __alignof (type));\
>>        else								      \
>>          {								      \
>>  -	 _result = (type *) (_tail->mem + (_tail->size - _tail->remaining));  \
>>  -	 size_t _padding = ((__alignof (type)				      \
>>  -			    - ((uintptr_t) _result & (__alignof (type) - 1))) \
>>  -			       & (__alignof (type) - 1));		      \
>>  -	 if (unlikely (_tail->remaining < _req + _padding))		      \
>>  -	   _result = (type *) __libdw_allocate (dbg, _req, __alignof (type)); \
>>  -	 else								      \
>>  -	   {								      \
>>  -	     _req += _padding;						      \
>>  -	     _result = (type *) ((char *) _result + _padding);		      \
>>  -	     _tail->remaining -= _req;					      \
>>  -	   }								      \
>>  +	 _required += _padding;						      \
>>  +	 _result = (type *) ((char *) _result + _padding);		      \
>>  +	 _tail->remaining -= _required;					      \
>>          }								      \
>>        _result; })
> 
> OK. Maybe add a comment that __libdw_alloc_tail makes sure that we get
> a thread local libdw_memblock to work with.
> 
>>   #define libdw_typed_alloc(dbg, type) \
>>     libdw_alloc (dbg, type, sizeof (type), 1)
>> 
>>  +/* Callback to choose a thread-local memory allocation stack.  */
>>  +extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
>>  +     __nonnull_attribute__ (1);
>>  +
>>   /* Callback to allocate more.  */
>>   extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>>        __attribute__ ((__malloc__)) __nonnull_attribute__ (1);
> 
> O. There is the comment already :)
> 
>>  diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
>>  index f2e74d18..86ca7032 100644
>>  --- a/libdw/libdw_alloc.c
>>  +++ b/libdw/libdw_alloc.c
>>  @@ -35,7 +35,68 @@
>>   #include <stdlib.h>
>>   #include "libdwP.h"
>>   #include "system.h"
>>  +#include "atomics.h"
>>  +#if USE_VG_ANNOTATIONS == 1
>>  +#include <helgrind.h>
>>  +#include <drd.h>
> 
> I think if you include helgrind.h you won't get the drd.h
> ANNOTATE_HAPPENS_BEFORE/AFTER. So do you also need to include drd.h?

Not really, just another habit. Since this file only needs the 
HAPPENS_* annotations, helgrind.h is sufficient.

> 
>>  +#else
>>  +#define ANNOTATE_HAPPENS_BEFORE(X)
>>  +#define ANNOTATE_HAPPENS_AFTER(X)
>>  +#endif
> 
> Could you explain the usage of the happens_before/after annotations in
> this code. I must admit that I don't fully understand why/how it works
> in this case. Specifically since realloc might change the address that
> mem_tails points to.

Reader-writer locks ensure no "readers" are present whenever a "writer" 
is around. In this case we use the "write" side for resizing mem_tails 
and the "read" side whenever mem_tails needs to stay stable. That is 
why most of the time we hold a read lock, and only when we need to 
reallocate do we drop it and re-acquire on the write side (pthread 
rwlocks cannot be upgraded in place, hence the unlock/relock and the 
re-check).

The annotations paper over a minor deficiency in Helgrind: for whatever 
reason, if you do writes under a read lock it reports races between 
those writes and the writes done under the write lock (in this case, in 
__libdw_allocate and the realloc). I haven't dug deep enough to know 
exactly why it happens, just that it does, and that adding this 
happens-before arc seems to fix the issue.
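
In code form, the pattern looks roughly like this (a simplified sketch,
not the patch itself; id, nstacks, resize_tails and tails stand in for
thread_id, dbg->mem_stacks, the realloc logic and dbg->mem_tails):

  pthread_rwlock_rdlock (&rwl);
  if (id >= nstacks)                  /* Array too small for this thread?  */
    {
      pthread_rwlock_unlock (&rwl);   /* POSIX rwlocks can't be upgraded,  */
      pthread_rwlock_wrlock (&rwl);   /* so drop and re-acquire as writer. */
      if (id >= nstacks)              /* Re-check: another writer may have
                                         resized while we were unlocked.  */
        {
          resize_tails (id + 1);      /* Writer-side realloc.  */
          ANNOTATE_HAPPENS_BEFORE (&tails);  /* H-B arc for Helgrind.  */
        }
      pthread_rwlock_unlock (&rwl);
      pthread_rwlock_rdlock (&rwl);   /* Back to reader mode.  */
    }
  ANNOTATE_HAPPENS_AFTER (&tails);
  /* ... reads and per-thread writes of tails[id] under the read lock ... */
  pthread_rwlock_unlock (&rwl);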

> 
>>  +#define THREAD_ID_UNSET ((size_t) -1)
>>  +static thread_local size_t thread_id = THREAD_ID_UNSET;
>>  +static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
> 
> OK, but maybe use static __thread size_t thread_id as explained above?

Fine by me.

> 
>>  +struct libdw_memblock *
>>  +__libdw_alloc_tail (Dwarf *dbg)
>>  +{
>>  +  if (thread_id == THREAD_ID_UNSET)
>>  +    thread_id = atomic_fetch_add (&next_id, 1);
>>  +
>>  +  pthread_rwlock_rdlock (&dbg->mem_rwl);
>>  +  if (thread_id >= dbg->mem_stacks)
>>  +    {
>>  +      pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +      pthread_rwlock_wrlock (&dbg->mem_rwl);
>>  +
>>  +      /* Another thread may have already reallocated. In theory using an
>>  +         atomic would be faster, but given that this only happens once per
>>  +         thread per Dwarf, some minor slowdown should be fine.  */
>>  +      if (thread_id >= dbg->mem_stacks)
>>  +        {
>>  +          dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
>>  +                                    * sizeof (struct libdw_memblock *));
>>  +          if (dbg->mem_tails == NULL)
>>  +            {
>>  +              pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +              dbg->oom_handler();
>>  +            }
>>  +          for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
>>  +            dbg->mem_tails[i] = NULL;
>>  +          dbg->mem_stacks = thread_id + 1;
>>  +          ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
>>  +        }
>>  +
>>  +      pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +      pthread_rwlock_rdlock (&dbg->mem_rwl);
>>  +    }
> 
> OK. So next_id only increases, so it might leak a bit without thread
> pools.
> 
> Might it make sense to have some __libdw_destroy_tail () function that
> could be called from dwarf_end that checks if this was the last Dwarf
> in use so we could reset next_id? It would only work in some cases, if
> there are multiple Dwarfs in use it probably is useless. I guess it is
> too much trickery for an odd corner case?
> 
> O, and I now think you would then also need something for dwarf_begin
> to reset any set thread_ids... bleah. So probably way too complicated.
> So lets not, unless you think this is actually simple.

Which is why I didn't want to do that.

The other option was to keep a sort of free list of ids, but then the 
cleanup isn't great (an id can only be recycled sometime after its 
thread has completed... and if you consider detached threads things get 
hairy). Plus it requires a fully concurrent stack or queue, which is a 
complicated data structure in itself.
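
For illustration, a deliberately simplified sketch of what such an id
free list would involve (a Treiber-style lock-free stack; all names are
made up, and a production version would additionally need safe memory
reclamation, e.g. hazard pointers, to close the ABA/use-after-free hole
flagged below):

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdlib.h>

  struct id_node { size_t id; struct id_node *next; };
  static struct id_node *_Atomic free_ids = NULL;

  /* Push a retired thread id onto the free list.  */
  static void
  id_release (size_t id)
  {
    struct id_node *n = malloc (sizeof *n);
    if (n == NULL)
      return;  /* Out of memory: leak the id rather than crash.  */
    n->id = id;
    n->next = atomic_load (&free_ids);
    /* On failure the expected value (n->next) is reloaded; retry.  */
    while (!atomic_compare_exchange_weak (&free_ids, &n->next, n))
      ;
  }

  /* Pop a reusable id; returns false if none are free.  */
  static bool
  id_acquire (size_t *id)
  {
    struct id_node *n = atomic_load (&free_ids);
    /* Reading n->next after another thread may have popped and freed
       n is the classic ABA/use-after-free hazard.  */
    while (n != NULL
           && !atomic_compare_exchange_weak (&free_ids, &n, n->next))
      ;
    if (n == NULL)
      return false;
    *id = n->id;
    free (n);
    return true;
  }

And even then, detached threads have no good point at which to call
id_release, which is where things get hairy.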

> 
>>  +  /* At this point, we have an entry in the tail array.  */
>>  +  ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
>>  +  struct libdw_memblock *result = dbg->mem_tails[thread_id];
>>  +  if (result == NULL)
>>  +    {
>>  +      result = malloc (dbg->mem_default_size);
>>  +      result->size = dbg->mem_default_size
>>  +                     - offsetof (struct libdw_memblock, mem);
>>  +      result->remaining = result->size;
>>  +      result->prev = NULL;
>>  +      dbg->mem_tails[thread_id] = result;
>>  +    }
>>  +  pthread_rwlock_unlock (&dbg->mem_rwl);
>>  +  return result;
>>  +}
> 
> OK.
> 
>>   void *
>>   __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>>  @@ -52,10 +113,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>>     newp->size = size - offsetof (struct libdw_memblock, mem);
>>     newp->remaining = (uintptr_t) newp + size - (result + minsize);
>> 
>>  -  newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
>>  -      &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
>>  -  if (pthread_setspecific (dbg->mem_key, newp) != 0)
>>  -    dbg->oom_handler ();
>>  +  pthread_rwlock_rdlock (&dbg->mem_rwl);
>>  +  newp->prev = dbg->mem_tails[thread_id];
>>  +  dbg->mem_tails[thread_id] = newp;
>>  +  pthread_rwlock_unlock (&dbg->mem_rwl);
>> 
>>     return (void *) result;
>>   }
> 
> OK. Since this is only called after __libdw_alloc_tail you know that
> thread_id will be set.

Exactly.

> 
> Thanks,
> 
> Mark


