This is the mail archive of the
elfutils-devel@sourceware.org
mailing list for the elfutils project.
Re: [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
Hi Jonathan,
On Tue, 2019-10-29 at 22:14 +0100, Mark Wielaard wrote:
> Pthread's thread-local variables are highly limited, which makes
> it difficult to use many Dwarfs. This replaces that with a
> less efficient (or elegant) but more robust method.
Thanks, it looks good (similar to a variant I already reviewed
earlier). Some small comments below.
I haven't benchmarked this version. Did you do any?
> diff --git a/lib/atomics.h b/lib/atomics.h
> index ffd12f87..e3773759 100644
> --- a/lib/atomics.h
> +++ b/lib/atomics.h
> @@ -31,7 +31,9 @@
> #if HAVE_STDATOMIC_H
> /* If possible, use the compiler's preferred atomics. */
> # include <stdatomic.h>
> +# include <threads.h>
> #else
> /* Otherwise, try to use the builtins provided by this compiler. */
> # include "stdatomic-fbsd.h"
> +# define thread_local __thread
> #endif /* HAVE_STDATOMIC_H */
Do we really need this?
We already use __thread unconditionally in the rest of the code.
The usage of threads.h seems to imply we actually want C11
_Thread_local. Is that what you really want, or can we just use
__thread in libdw_alloc.c for thread_id?
> diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
> index 8d137414..eadff13b 100644
> --- a/libdw/dwarf_begin_elf.c
> +++ b/libdw/dwarf_begin_elf.c
> @@ -418,13 +418,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd, Elf_Scn *scngrp)
> actual allocation. */
> result->mem_default_size = mem_default_size;
> result->oom_handler = __libdw_oom;
> - if (pthread_key_create (&result->mem_key, NULL) != 0)
> + if (pthread_rwlock_init(&result->mem_rwl, NULL) != 0)
> {
> free (result);
> - __libdw_seterrno (DWARF_E_NOMEM); /* no memory or max pthread keys. */
> + __libdw_seterrno (DWARF_E_NOMEM); /* no memory. */
> return NULL;
> }
> - atomic_init (&result->mem_tail, (uintptr_t)NULL);
> + result->mem_stacks = 0;
> + result->mem_tails = NULL;
>
> if (cmd == DWARF_C_READ || cmd == DWARF_C_RDWR)
> {
OK.
> diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
> index a2e94436..3fd2836d 100644
> --- a/libdw/dwarf_end.c
> +++ b/libdw/dwarf_end.c
> @@ -95,17 +95,19 @@ dwarf_end (Dwarf *dwarf)
> tdestroy (dwarf->split_tree, noop_free);
>
> /* Free the internally allocated memory. */
> - struct libdw_memblock *memp;
> - memp = (struct libdw_memblock *) (atomic_load_explicit
> - (&dwarf->mem_tail,
> - memory_order_relaxed));
> - while (memp != NULL)
> - {
> - struct libdw_memblock *prevp = memp->prev;
> - free (memp);
> - memp = prevp;
> - }
> - pthread_key_delete (dwarf->mem_key);
> + for (size_t i = 0; i < dwarf->mem_stacks; i++)
> + {
> + struct libdw_memblock *memp = dwarf->mem_tails[i];
> + while (memp != NULL)
> + {
> + struct libdw_memblock *prevp = memp->prev;
> + free (memp);
> + memp = prevp;
> + }
> + }
> + if (dwarf->mem_tails != NULL)
> + free (dwarf->mem_tails);
> + pthread_rwlock_destroy (&dwarf->mem_rwl);
>
> /* Free the pubnames helper structure. */
> free (dwarf->pubnames_sets);
OK. Might it be an idea to have some call to reset next_id (see below)?
> diff --git a/libdw/libdwP.h b/libdw/libdwP.h
> index ad2599eb..3e1ef59b 100644
> --- a/libdw/libdwP.h
> +++ b/libdw/libdwP.h
> @@ -149,17 +149,6 @@ enum
>
> #include "dwarf_sig8_hash.h"
>
> -/* Structure for internal memory handling. This is basically a simplified
> - reimplementation of obstacks. Unfortunately the standard obstack
> - implementation is not usable in libraries. */
> -struct libdw_memblock
> -{
> - size_t size;
> - size_t remaining;
> - struct libdw_memblock *prev;
> - char mem[0];
> -};
> -
> /* This is the structure representing the debugging state. */
> struct Dwarf
> {
> @@ -231,11 +220,22 @@ struct Dwarf
> /* Similar for addrx/constx, which will come from .debug_addr section. */
> struct Dwarf_CU *fake_addr_cu;
>
> - /* Internal memory handling. Each thread allocates separately and only
> - allocates from its own blocks, while all the blocks are pushed atomically
> - onto a unified stack for easy deallocation. */
> - pthread_key_t mem_key;
> - atomic_uintptr_t mem_tail;
> + /* Supporting lock for internal memory handling. Ensures threads that have
> + an entry in the mem_tails array are not disturbed by new threads doing
> + allocations for this Dwarf. */
> + pthread_rwlock_t mem_rwl;
> +
> + /* Internal memory handling. This is basically a simplified thread-local
> + reimplementation of obstacks. Unfortunately the standard obstack
> + implementation is not usable in libraries. */
> + size_t mem_stacks;
> + struct libdw_memblock
> + {
> + size_t size;
> + size_t remaining;
> + struct libdw_memblock *prev;
> + char mem[0];
> + } **mem_tails;
>
> /* Default size of allocated memory blocks. */
> size_t mem_default_size;
OK.
> @@ -578,34 +578,31 @@ libdw_valid_user_form (int form)
> extern void __libdw_seterrno (int value) internal_function;
>
>
> -/* Memory handling, the easy parts. This macro does not do nor need to do any
> - locking for proper concurrent operation. */
> +/* Memory handling, the easy parts. */
> #define libdw_alloc(dbg, type, tsize, cnt) \
> - ({ struct libdw_memblock *_tail = pthread_getspecific (dbg->mem_key); \
> - size_t _req = (tsize) * (cnt); \
> - type *_result; \
> - if (unlikely (_tail == NULL)) \
> - _result = (type *) __libdw_allocate (dbg, _req, __alignof (type)); \
> + ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg); \
> + size_t _required = (tsize) * (cnt); \
> + type *_result = (type *) (_tail->mem + (_tail->size - _tail->remaining));\
> + size_t _padding = ((__alignof (type) \
> + - ((uintptr_t) _result & (__alignof (type) - 1))) \
> + & (__alignof (type) - 1)); \
> + if (unlikely (_tail->remaining < _required + _padding)) \
> + _result = (type *) __libdw_allocate (dbg, _required, __alignof (type));\
> else \
> { \
> - _result = (type *) (_tail->mem + (_tail->size - _tail->remaining)); \
> - size_t _padding = ((__alignof (type) \
> - - ((uintptr_t) _result & (__alignof (type) - 1))) \
> - & (__alignof (type) - 1)); \
> - if (unlikely (_tail->remaining < _req + _padding)) \
> - _result = (type *) __libdw_allocate (dbg, _req, __alignof (type)); \
> - else \
> - { \
> - _req += _padding; \
> - _result = (type *) ((char *) _result + _padding); \
> - _tail->remaining -= _req; \
> - } \
> + _required += _padding; \
> + _result = (type *) ((char *) _result + _padding); \
> + _tail->remaining -= _required; \
> } \
> _result; })
OK. Maybe add a comment that __libdw_alloc_tail makes sure that we get
a thread local libdw_memblock to work with.
> #define libdw_typed_alloc(dbg, type) \
> libdw_alloc (dbg, type, sizeof (type), 1)
>
> +/* Callback to choose a thread-local memory allocation stack. */
> +extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
> + __nonnull_attribute__ (1);
> +
> /* Callback to allocate more. */
> extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
> __attribute__ ((__malloc__)) __nonnull_attribute__ (1);
O. There is the comment already :)
> diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
> index f2e74d18..86ca7032 100644
> --- a/libdw/libdw_alloc.c
> +++ b/libdw/libdw_alloc.c
> @@ -35,7 +35,68 @@
> #include <stdlib.h>
> #include "libdwP.h"
> #include "system.h"
> +#include "atomics.h"
> +#if USE_VG_ANNOTATIONS == 1
> +#include <helgrind.h>
> +#include <drd.h>
I think if you include helgrind.h you won't get the drd.h
ANNOTATE_HAPPENS_BEFORE/AFTER. So do you also need to include drd.h?
> +#else
> +#define ANNOTATE_HAPPENS_BEFORE(X)
> +#define ANNOTATE_HAPPENS_AFTER(X)
> +#endif
Could you explain the usage of the happens_before/after annotations in
this code. I must admit that I don't fully understand why/how it works
in this case. Specifically since realloc might change the address that
mem_tails points to.
> +#define THREAD_ID_UNSET ((size_t) -1)
> +static thread_local size_t thread_id = THREAD_ID_UNSET;
> +static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
OK, but maybe use static __thread size_t thread_id as explained above?
> +struct libdw_memblock *
> +__libdw_alloc_tail (Dwarf *dbg)
> +{
> + if (thread_id == THREAD_ID_UNSET)
> + thread_id = atomic_fetch_add (&next_id, 1);
> +
> + pthread_rwlock_rdlock (&dbg->mem_rwl);
> + if (thread_id >= dbg->mem_stacks)
> + {
> + pthread_rwlock_unlock (&dbg->mem_rwl);
> + pthread_rwlock_wrlock (&dbg->mem_rwl);
> +
> + /* Another thread may have already reallocated. In theory using an
> + atomic would be faster, but given that this only happens once per
> + thread per Dwarf, some minor slowdown should be fine. */
> + if (thread_id >= dbg->mem_stacks)
> + {
> + dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
> + * sizeof (struct libdw_memblock *));
> + if (dbg->mem_tails == NULL)
> + {
> + pthread_rwlock_unlock (&dbg->mem_rwl);
> + dbg->oom_handler();
> + }
> + for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
> + dbg->mem_tails[i] = NULL;
> + dbg->mem_stacks = thread_id + 1;
> + ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
> + }
> +
> + pthread_rwlock_unlock (&dbg->mem_rwl);
> + pthread_rwlock_rdlock (&dbg->mem_rwl);
> + }
OK. So next_id only increases, so it might leak a bit without thread
pools.
Might it make sense to have some __libdw_destroy_tail () function that
could be called from dwarf_end that checks if this was the last Dwarf
in use so we could reset next_id? It would only work in some cases, if
there are multiple Dwarfs in use it probably is useless. I guess it is
too much trickery for an odd corner case?
O, and I now think you would then also need something for dwarf_begin
to reset any set thread_ids... bleah. So probably way too complicated.
So lets not, unless you think this is actually simple.
> + /* At this point, we have an entry in the tail array. */
> + ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
> + struct libdw_memblock *result = dbg->mem_tails[thread_id];
> + if (result == NULL)
> + {
> + result = malloc (dbg->mem_default_size);
> + result->size = dbg->mem_default_size
> + - offsetof (struct libdw_memblock, mem);
> + result->remaining = result->size;
> + result->prev = NULL;
> + dbg->mem_tails[thread_id] = result;
> + }
> + pthread_rwlock_unlock (&dbg->mem_rwl);
> + return result;
> +}
OK.
> void *
> __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
> @@ -52,10 +113,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
> newp->size = size - offsetof (struct libdw_memblock, mem);
> newp->remaining = (uintptr_t) newp + size - (result + minsize);
>
> - newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
> - &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
> - if (pthread_setspecific (dbg->mem_key, newp) != 0)
> - dbg->oom_handler ();
> + pthread_rwlock_rdlock (&dbg->mem_rwl);
> + newp->prev = dbg->mem_tails[thread_id];
> + dbg->mem_tails[thread_id] = newp;
> + pthread_rwlock_unlock (&dbg->mem_rwl);
>
> return (void *) result;
> }
OK. Since this is only called after __libdw_alloc_tail you know that
thread_id will be set.
Thanks,
Mark