[PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
Jonathon Anderson
jma14@rice.edu
Thu Nov 7 18:40:00 GMT 2019
On Thu, Nov 7, 2019 at 18:20, Mark Wielaard <mark@klomp.org> wrote:
> Hi Jonathon,
>
> On Tue, 2019-10-29 at 22:14 +0100, Mark Wielaard wrote:
>> Pthread's thread-local variables are highly limited, which makes
>> it difficult to use many Dwarfs. This replaces that with a
>> less efficient (or elegant) but more robust method.
>
> Thanks, it looks good (similar to a variant I already reviewed
> earlier). Some small comments below.
>
> I haven't benchmarked this version. Did you do any?
I haven't benchmarked this version, but I did benchmark the equivalent
earlier version (this version is almost literally a rebase of the
other). I don't have the exact results on hand, but what I remember is that
the pthread_key method was faster (and handled the many-thread case
better), by maybe a factor of 1.5x-2x in parallel. In serial the
overhead was minimal (just an extra pointer indirection on allocations).
The fastest (I think) would be the "cloned" Dwarf idea, but that
requires an overhaul.
>
>> diff --git a/lib/atomics.h b/lib/atomics.h
>> index ffd12f87..e3773759 100644
>> --- a/lib/atomics.h
>> +++ b/lib/atomics.h
>> @@ -31,7 +31,9 @@
>> #if HAVE_STDATOMIC_H
>> /* If possible, use the compiler's preferred atomics. */
>> # include <stdatomic.h>
>> +# include <threads.h>
>> #else
>> /* Otherwise, try to use the builtins provided by this compiler.
>> */
>> # include "stdatomic-fbsd.h"
>> +# define thread_local __thread
>> #endif /* HAVE_STDATOMIC_H */
>
> Do we really need this?
> We already use __thread unconditionally in the rest of the code.
> The usage of threads.h seems to imply we actually want C11
> _Thread_local. Is that what you really want, or can we just use
> __thread in libdw_alloc.c for thread_id?
>
>> diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
>> index 8d137414..eadff13b 100644
>> --- a/libdw/dwarf_begin_elf.c
>> +++ b/libdw/dwarf_begin_elf.c
>> @@ -418,13 +418,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd,
>> Elf_Scn *scngrp)
>> actual allocation. */
>> result->mem_default_size = mem_default_size;
>> result->oom_handler = __libdw_oom;
>> - if (pthread_key_create (&result->mem_key, NULL) != 0)
>> + if (pthread_rwlock_init(&result->mem_rwl, NULL) != 0)
>> {
>> free (result);
>> - __libdw_seterrno (DWARF_E_NOMEM); /* no memory or max
>> pthread keys. */
>> + __libdw_seterrno (DWARF_E_NOMEM); /* no memory. */
>> return NULL;
>> }
>> - atomic_init (&result->mem_tail, (uintptr_t)NULL);
>> + result->mem_stacks = 0;
>> + result->mem_tails = NULL;
>>
>> if (cmd == DWARF_C_READ || cmd == DWARF_C_RDWR)
>> {
>
> OK.
>
>> diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
>> index a2e94436..3fd2836d 100644
>> --- a/libdw/dwarf_end.c
>> +++ b/libdw/dwarf_end.c
>> @@ -95,17 +95,19 @@ dwarf_end (Dwarf *dwarf)
>> tdestroy (dwarf->split_tree, noop_free);
>>
>> /* Free the internally allocated memory. */
>> - struct libdw_memblock *memp;
>> - memp = (struct libdw_memblock *) (atomic_load_explicit
>> - (&dwarf->mem_tail,
>> - memory_order_relaxed));
>> - while (memp != NULL)
>> - {
>> - struct libdw_memblock *prevp = memp->prev;
>> - free (memp);
>> - memp = prevp;
>> - }
>> - pthread_key_delete (dwarf->mem_key);
>> + for (size_t i = 0; i < dwarf->mem_stacks; i++)
>> + {
>> + struct libdw_memblock *memp = dwarf->mem_tails[i];
>> + while (memp != NULL)
>> + {
>> + struct libdw_memblock *prevp = memp->prev;
>> + free (memp);
>> + memp = prevp;
>> + }
>> + }
>> + if (dwarf->mem_tails != NULL)
>> + free (dwarf->mem_tails);
>> + pthread_rwlock_destroy (&dwarf->mem_rwl);
>>
>> /* Free the pubnames helper structure. */
>> free (dwarf->pubnames_sets);
>
> OK. Might it be an idea to have some call to reset next_id (see
> below)?
>
>> diff --git a/libdw/libdwP.h b/libdw/libdwP.h
>> index ad2599eb..3e1ef59b 100644
>> --- a/libdw/libdwP.h
>> +++ b/libdw/libdwP.h
>> @@ -149,17 +149,6 @@ enum
>>
>> #include "dwarf_sig8_hash.h"
>>
>> -/* Structure for internal memory handling. This is basically a
>> simplified
>> - reimplementation of obstacks. Unfortunately the standard
>> obstack
>> - implementation is not usable in libraries. */
>> -struct libdw_memblock
>> -{
>> - size_t size;
>> - size_t remaining;
>> - struct libdw_memblock *prev;
>> - char mem[0];
>> -};
>> -
>> /* This is the structure representing the debugging state. */
>> struct Dwarf
>> {
>> @@ -231,11 +220,22 @@ struct Dwarf
>> /* Similar for addrx/constx, which will come from .debug_addr
>> section. */
>> struct Dwarf_CU *fake_addr_cu;
>>
>> - /* Internal memory handling. Each thread allocates separately
>> and only
>> - allocates from its own blocks, while all the blocks are
>> pushed atomically
>> - onto a unified stack for easy deallocation. */
>> - pthread_key_t mem_key;
>> - atomic_uintptr_t mem_tail;
>> + /* Supporting lock for internal memory handling. Ensures
>> threads that have
>> + an entry in the mem_tails array are not disturbed by new
>> threads doing
>> + allocations for this Dwarf. */
>> + pthread_rwlock_t mem_rwl;
>> +
>> + /* Internal memory handling. This is basically a simplified
>> thread-local
>> + reimplementation of obstacks. Unfortunately the standard
>> obstack
>> + implementation is not usable in libraries. */
>> + size_t mem_stacks;
>> + struct libdw_memblock
>> + {
>> + size_t size;
>> + size_t remaining;
>> + struct libdw_memblock *prev;
>> + char mem[0];
>> + } **mem_tails;
>>
>> /* Default size of allocated memory blocks. */
>> size_t mem_default_size;
>
> OK.
>
>> @@ -578,34 +578,31 @@ libdw_valid_user_form (int form)
>> extern void __libdw_seterrno (int value) internal_function;
>>
>>
>> -/* Memory handling, the easy parts. This macro does not do nor
>> need to do any
>> - locking for proper concurrent operation. */
>> +/* Memory handling, the easy parts. */
>> #define libdw_alloc(dbg, type, tsize, cnt) \
>> - ({ struct libdw_memblock *_tail = pthread_getspecific
>> (dbg->mem_key); \
>> - size_t _req = (tsize) * (cnt); \
>> - type *_result; \
>> - if (unlikely (_tail == NULL)) \
>> - _result = (type *) __libdw_allocate (dbg, _req, __alignof
>> (type)); \
>> + ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);
>> \
>> + size_t _required = (tsize) * (cnt); \
>> + type *_result = (type *) (_tail->mem + (_tail->size -
>> _tail->remaining));\
>> + size_t _padding = ((__alignof (type) \
>> + - ((uintptr_t) _result & (__alignof (type) - 1))) \
>> + & (__alignof (type) - 1)); \
>> + if (unlikely (_tail->remaining < _required + _padding))
>> \
>> + _result = (type *) __libdw_allocate (dbg, _required,
>> __alignof (type));\
>> else \
>> { \
>> - _result = (type *) (_tail->mem + (_tail->size -
>> _tail->remaining)); \
>> - size_t _padding = ((__alignof (type) \
>> - - ((uintptr_t) _result & (__alignof (type) - 1))) \
>> - & (__alignof (type) - 1)); \
>> - if (unlikely (_tail->remaining < _req + _padding)) \
>> - _result = (type *) __libdw_allocate (dbg, _req, __alignof
>> (type)); \
>> - else \
>> - { \
>> - _req += _padding; \
>> - _result = (type *) ((char *) _result + _padding); \
>> - _tail->remaining -= _req; \
>> - } \
>> + _required += _padding; \
>> + _result = (type *) ((char *) _result + _padding); \
>> + _tail->remaining -= _required; \
>> } \
>> _result; })
>
> OK. Maybe add a comment that __libdw_alloc_tail makes sure that we get
> a thread local libdw_memblock to work with.
>
>> #define libdw_typed_alloc(dbg, type) \
>> libdw_alloc (dbg, type, sizeof (type), 1)
>>
>> +/* Callback to choose a thread-local memory allocation stack. */
>> +extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
>> + __nonnull_attribute__ (1);
>> +
>> /* Callback to allocate more. */
>> extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t
>> align)
>> __attribute__ ((__malloc__)) __nonnull_attribute__ (1);
>
> O. There is the comment already :)
>
>> diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
>> index f2e74d18..86ca7032 100644
>> --- a/libdw/libdw_alloc.c
>> +++ b/libdw/libdw_alloc.c
>> @@ -35,7 +35,68 @@
>> #include <stdlib.h>
>> #include "libdwP.h"
>> #include "system.h"
>> +#include "atomics.h"
>> +#if USE_VG_ANNOTATIONS == 1
>> +#include <helgrind.h>
>> +#include <drd.h>
>
> I think if you include helgrind.h you won't get the drd.h
> ANNOTATE_HAPPENS_BEFORE/AFTER. So do you also need to include drd.h?
>
>> +#else
>> +#define ANNOTATE_HAPPENS_BEFORE(X)
>> +#define ANNOTATE_HAPPENS_AFTER(X)
>> +#endif
>
> Could you explain the usage of the happens_before/after annotations in
> this code. I must admit that I don't fully understand why/how it works
> in this case. Specifically since realloc might change the address that
> mem_tails points to.
>
>> +#define THREAD_ID_UNSET ((size_t) -1)
>> +static thread_local size_t thread_id = THREAD_ID_UNSET;
>> +static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
>
> OK, but maybe use static __thread size_t thread_id as explained above?
>
>> +struct libdw_memblock *
>> +__libdw_alloc_tail (Dwarf *dbg)
>> +{
>> + if (thread_id == THREAD_ID_UNSET)
>> + thread_id = atomic_fetch_add (&next_id, 1);
>> +
>> + pthread_rwlock_rdlock (&dbg->mem_rwl);
>> + if (thread_id >= dbg->mem_stacks)
>> + {
>> + pthread_rwlock_unlock (&dbg->mem_rwl);
>> + pthread_rwlock_wrlock (&dbg->mem_rwl);
>> +
>> + /* Another thread may have already reallocated. In theory
>> using an
>> + atomic would be faster, but given that this only happens
>> once per
>> + thread per Dwarf, some minor slowdown should be fine. */
>> + if (thread_id >= dbg->mem_stacks)
>> + {
>> + dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
>> + * sizeof (struct
>> libdw_memblock *));
>> + if (dbg->mem_tails == NULL)
>> + {
>> + pthread_rwlock_unlock (&dbg->mem_rwl);
>> + dbg->oom_handler();
>> + }
>> + for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
>> + dbg->mem_tails[i] = NULL;
>> + dbg->mem_stacks = thread_id + 1;
>> + ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
>> + }
>> +
>> + pthread_rwlock_unlock (&dbg->mem_rwl);
>> + pthread_rwlock_rdlock (&dbg->mem_rwl);
>> + }
>
> OK. So next_id only increases, so it might leak a bit without thread
> pools.
>
> Might it make sense to have some __libdw_destroy_tail () function that
> could be called from dwarf_end that checks if this was the last Dwarf
> in use so we could reset next_id? It would only work in some cases, if
> there are multiple Dwarfs in use it probably is useless. I guess it is
> too much trickery for an odd corner case?
>
> O, and I now think you would then also need something for dwarf_begin
> to reset any set thread_ids... bleah. So probably way too complicated.
> So lets not, unless you think this is actually simple.
>
>> + /* At this point, we have an entry in the tail array. */
>> + ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
>> + struct libdw_memblock *result = dbg->mem_tails[thread_id];
>> + if (result == NULL)
>> + {
>> + result = malloc (dbg->mem_default_size);
>> + result->size = dbg->mem_default_size
>> + - offsetof (struct libdw_memblock, mem);
>> + result->remaining = result->size;
>> + result->prev = NULL;
>> + dbg->mem_tails[thread_id] = result;
>> + }
>> + pthread_rwlock_unlock (&dbg->mem_rwl);
>> + return result;
>> +}
>
> OK.
>
>> void *
>> __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
>> @@ -52,10 +113,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize,
>> size_t align)
>> newp->size = size - offsetof (struct libdw_memblock, mem);
>> newp->remaining = (uintptr_t) newp + size - (result + minsize);
>>
>> - newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
>> - &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
>> - if (pthread_setspecific (dbg->mem_key, newp) != 0)
>> - dbg->oom_handler ();
>> + pthread_rwlock_rdlock (&dbg->mem_rwl);
>> + newp->prev = dbg->mem_tails[thread_id];
>> + dbg->mem_tails[thread_id] = newp;
>> + pthread_rwlock_unlock (&dbg->mem_rwl);
>>
>> return (void *) result;
>> }
>
> OK. Since this is only called after __libdw_alloc_tail you know that
> thread_id will be set.
>
> Thanks,
>
> Mark
More information about the Elfutils-devel
mailing list