This is the mail archive of the elfutils-devel@sourceware.org mailing list for the elfutils project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.

From: Jonathon Anderson <jma14 at rice dot edu>
To: Mark Wielaard <mark at klomp dot org>
Cc: elfutils-devel at sourceware dot org
Date: Thu, 07 Nov 2019 12:40:30 -0600
Subject: Re: [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
References: <1572380520.19948.0@rice.edu> <20191029211437.3268-1-mark@klomp.org> <20191029211437.3268-2-mark@klomp.org> <93a4d8983b6ec43c09c6c3b3f6ed8d358321bb9d.camel@klomp.org>



On Thu, Nov 7, 2019 at 18:20, Mark Wielaard <mark@klomp.org> wrote:

Hi Jonathan,

On Tue, 2019-10-29 at 22:14 +0100, Mark Wielaard wrote:

 Pthread's thread-local variables are highly limited, which makes
 it difficult to use many Dwarfs. This replaces that with a
 less efficient (or elegant) but more robust method.


Thanks, it looks good (similar to a variant I already reviewed
earlier). Some small comments below.

I haven't benchmarked this version. Did you do any?

I haven't benchmarked this version, but I did benchmark the equivalentearlier version (this version is almost quite literally a rebase of theother). I don't have the exact results on hand, what I remember is thatthe pthread_key method was faster (and handled the many-thread casebetter), by maybe a factor of 1.5x-2x in parallel. In serial theoverhead was minimal (just an extra pointer indirection on allocations).

The fastest (I think) would be the "cloned" Dwarf idea, but thatrequires an overhaul.

 diff --git a/lib/atomics.h b/lib/atomics.h
 index ffd12f87..e3773759 100644
 --- a/lib/atomics.h
 +++ b/lib/atomics.h
 @@ -31,7 +31,9 @@
  #if HAVE_STDATOMIC_H
  /* If possible, use the compiler's preferred atomics.  */
  # include <stdatomic.h>
 +# include <threads.h>
  #else

/* Otherwise, try to use the builtins provided by this compiler.*/

  # include "stdatomic-fbsd.h"
 +# define thread_local __thread
  #endif /* HAVE_STDATOMIC_H */


Do we really need this?
We already use __thread unconditionally in the rest of the code.
The usage of threads.h seems to imply we actually want C11
_Thread_local. Is that what you really want, or can we just use
__thread in libdw_alloc.c for thread_id?

 diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
 index 8d137414..eadff13b 100644
 --- a/libdw/dwarf_begin_elf.c
 +++ b/libdw/dwarf_begin_elf.c

@@ -418,13 +418,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd,Elf_Scn *scngrp)

       actual allocation.  */
    result->mem_default_size = mem_default_size;
    result->oom_handler = __libdw_oom;
 -  if (pthread_key_create (&result->mem_key, NULL) != 0)
 +  if (pthread_rwlock_init(&result->mem_rwl, NULL) != 0)
      {
        free (result);

- __libdw_seterrno (DWARF_E_NOMEM); /* no memory or maxpthread keys. */

 +      __libdw_seterrno (DWARF_E_NOMEM); /* no memory.  */
        return NULL;
      }
 -  atomic_init (&result->mem_tail, (uintptr_t)NULL);
 +  result->mem_stacks = 0;
 +  result->mem_tails = NULL;

    if (cmd == DWARF_C_READ || cmd == DWARF_C_RDWR)
      {

OK.

 diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
 index a2e94436..3fd2836d 100644
 --- a/libdw/dwarf_end.c
 +++ b/libdw/dwarf_end.c
 @@ -95,17 +95,19 @@ dwarf_end (Dwarf *dwarf)
        tdestroy (dwarf->split_tree, noop_free);

        /* Free the internally allocated memory.  */
 -      struct libdw_memblock *memp;
 -      memp = (struct libdw_memblock *) (atomic_load_explicit
 -					(&dwarf->mem_tail,
 -					 memory_order_relaxed));
 -      while (memp != NULL)
 -	{
 -	  struct libdw_memblock *prevp = memp->prev;
 -	  free (memp);
 -	  memp = prevp;
 -	}
 -      pthread_key_delete (dwarf->mem_key);
 +      for (size_t i = 0; i < dwarf->mem_stacks; i++)
 +        {
 +          struct libdw_memblock *memp = dwarf->mem_tails[i];
 +          while (memp != NULL)
 +	    {
 +	      struct libdw_memblock *prevp = memp->prev;
 +	      free (memp);
 +	      memp = prevp;
 +	    }
 +        }
 +      if (dwarf->mem_tails != NULL)
 +        free (dwarf->mem_tails);
 +      pthread_rwlock_destroy (&dwarf->mem_rwl);

        /* Free the pubnames helper structure.  */
        free (dwarf->pubnames_sets);

OK. Might it be an idea to have some call to reset next_id (seebelow)?

 diff --git a/libdw/libdwP.h b/libdw/libdwP.h
 index ad2599eb..3e1ef59b 100644
 --- a/libdw/libdwP.h
 +++ b/libdw/libdwP.h
 @@ -149,17 +149,6 @@ enum

  #include "dwarf_sig8_hash.h"
-/* Structure for internal memory handling. This is basically asimplified- reimplementation of obstacks. Unfortunately the standardobstack
 -   implementation is not usable in libraries.  */
 -struct libdw_memblock
 -{
 -  size_t size;
 -  size_t remaining;
 -  struct libdw_memblock *prev;
 -  char mem[0];
 -};
 -
  /* This is the structure representing the debugging state.  */
  struct Dwarf
  {
 @@ -231,11 +220,22 @@ struct Dwarf
/* Similar for addrx/constx, which will come from .debug_addrsection. */
    struct Dwarf_CU *fake_addr_cu;
- /* Internal memory handling. Each thread allocates separatelyand only- allocates from its own blocks, while all the blocks arepushed atomically
 -     onto a unified stack for easy deallocation.  */
 -  pthread_key_t mem_key;
 -  atomic_uintptr_t mem_tail;
+ /* Supporting lock for internal memory handling. Ensuresthreads that have+ an entry in the mem_tails array are not disturbed by newthreads doing
 +     allocations for this Dwarf.  */
 +  pthread_rwlock_t mem_rwl;
 +
+ /* Internal memory handling. This is basically a simplifiedthread-local+ reimplementation of obstacks. Unfortunately the standardobstack
 +     implementation is not usable in libraries.  */
 +  size_t mem_stacks;
 +  struct libdw_memblock
 +  {
 +    size_t size;
 +    size_t remaining;
 +    struct libdw_memblock *prev;
 +    char mem[0];
 +  } **mem_tails;

    /* Default size of allocated memory blocks.  */
    size_t mem_default_size;

OK.

 @@ -578,34 +578,31 @@ libdw_valid_user_form (int form)
  extern void __libdw_seterrno (int value) internal_function;
-/* Memory handling, the easy parts. This macro does not do norneed to do any
 -   locking for proper concurrent operation.  */
 +/* Memory handling, the easy parts.  */
  #define libdw_alloc(dbg, type, tsize, cnt) \
- ({ struct libdw_memblock *_tail = pthread_getspecific(dbg->mem_key); \
 -     size_t _req = (tsize) * (cnt);					      \
 -     type *_result;							      \
 -     if (unlikely (_tail == NULL))					      \
- _result = (type *) __libdw_allocate (dbg, _req, __alignof(type)); \+ ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);\
 +     size_t _required = (tsize) * (cnt);				      \
+ type *_result = (type *) (_tail->mem + (_tail->size -_tail->remaining));\
 +     size_t _padding = ((__alignof (type)				      \
 +			 - ((uintptr_t) _result & (__alignof (type) - 1)))    \
 +			& (__alignof (type) - 1));			      \
+ if (unlikely (_tail->remaining < _required + _padding))\+ _result = (type *) __libdw_allocate (dbg, _required,__alignof (type));\
       else								      \
         {								      \
- _result = (type *) (_tail->mem + (_tail->size -_tail->remaining)); \
 -	 size_t _padding = ((__alignof (type)				      \
 -			    - ((uintptr_t) _result & (__alignof (type) - 1))) \
 -			       & (__alignof (type) - 1));		      \
 -	 if (unlikely (_tail->remaining < _req + _padding))		      \
- _result = (type *) __libdw_allocate (dbg, _req, __alignof(type)); \
 -	 else								      \
 -	   {								      \
 -	     _req += _padding;						      \
 -	     _result = (type *) ((char *) _result + _padding);		      \
 -	     _tail->remaining -= _req;					      \
 -	   }								      \
 +	 _required += _padding;						      \
 +	 _result = (type *) ((char *) _result + _padding);		      \
 +	 _tail->remaining -= _required;					      \
         }								      \
       _result; })


OK. Maybe add a comment that __libdw_alloc_tail makes sure that we get
a thread local libdw_memblock to work with.

  #define libdw_typed_alloc(dbg, type) \
    libdw_alloc (dbg, type, sizeof (type), 1)

 +/* Callback to choose a thread-local memory allocation stack.  */
 +extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
 +     __nonnull_attribute__ (1);
 +
  /* Callback to allocate more.  */

extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_talign)

       __attribute__ ((__malloc__)) __nonnull_attribute__ (1);


O. There is the comment already :)

 diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
 index f2e74d18..86ca7032 100644
 --- a/libdw/libdw_alloc.c
 +++ b/libdw/libdw_alloc.c
 @@ -35,7 +35,68 @@
  #include <stdlib.h>
  #include "libdwP.h"
  #include "system.h"
 +#include "atomics.h"
 +#if USE_VG_ANNOTATIONS == 1
 +#include <helgrind.h>
 +#include <drd.h>


I think if you include helgrind.h you won't get the drd.h
ANNOTATE_HAPPENS_BEFORE/AFTER. So do you also need to include drd.h?

 +#else
 +#define ANNOTATE_HAPPENS_BEFORE(X)
 +#define ANNOTATE_HAPPENS_AFTER(X)
 +#endif


Could you explain the usage of the happens_before/after annotations in
this code. I must admit that I don't fully understand why/how it works
in this case. Specifically since realloc might change the address that
mem_tails points to.

 +#define THREAD_ID_UNSET ((size_t) -1)
 +static thread_local size_t thread_id = THREAD_ID_UNSET;
 +static atomic_size_t next_id = ATOMIC_VAR_INIT(0);


OK, but maybe use static __thread size_t thread_id as explained above?

 +struct libdw_memblock *
 +__libdw_alloc_tail (Dwarf *dbg)
 +{
 +  if (thread_id == THREAD_ID_UNSET)
 +    thread_id = atomic_fetch_add (&next_id, 1);
 +
 +  pthread_rwlock_rdlock (&dbg->mem_rwl);
 +  if (thread_id >= dbg->mem_stacks)
 +    {
 +      pthread_rwlock_unlock (&dbg->mem_rwl);
 +      pthread_rwlock_wrlock (&dbg->mem_rwl);
 +

+ /* Another thread may have already reallocated. In theoryusing an+ atomic would be faster, but given that this only happensonce per

 +         thread per Dwarf, some minor slowdown should be fine.  */
 +      if (thread_id >= dbg->mem_stacks)
 +        {
 +          dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)

+ * sizeof (structlibdw_memblock *));

 +          if (dbg->mem_tails == NULL)
 +            {
 +              pthread_rwlock_unlock (&dbg->mem_rwl);
 +              dbg->oom_handler();
 +            }
 +          for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
 +            dbg->mem_tails[i] = NULL;
 +          dbg->mem_stacks = thread_id + 1;
 +          ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
 +        }
 +
 +      pthread_rwlock_unlock (&dbg->mem_rwl);
 +      pthread_rwlock_rdlock (&dbg->mem_rwl);
 +    }


OK. So next_id only increases, so it might leak a bit without thread
pools.

Might it make sense to have some __libdw_destroy_tail () function that
could be called from dwarf_end that checks if this was the last Dwarf
in use so we could reset next_id? It would only work in some cases, if
there are multiple Dwarfs in use it probably is useless. I guess it is
too much trickery for an odd corner case?

O, and I now think you would then also need something for dwarf_begin
to reset any set thread_ids... bleah. So probably way too complicated.
So lets not, unless you think this is actually simple.

 +  /* At this point, we have an entry in the tail array.  */
 +  ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
 +  struct libdw_memblock *result = dbg->mem_tails[thread_id];
 +  if (result == NULL)
 +    {
 +      result = malloc (dbg->mem_default_size);
 +      result->size = dbg->mem_default_size
 +                     - offsetof (struct libdw_memblock, mem);
 +      result->remaining = result->size;
 +      result->prev = NULL;
 +      dbg->mem_tails[thread_id] = result;
 +    }
 +  pthread_rwlock_unlock (&dbg->mem_rwl);
 +  return result;
 +}

OK.

  void *
  __libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)

@@ -52,10 +113,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize,size_t align)

    newp->size = size - offsetof (struct libdw_memblock, mem);
    newp->remaining = (uintptr_t) newp + size - (result + minsize);

 -  newp->prev = (struct libdw_memblock*)atomic_exchange_explicit(
 -      &dbg->mem_tail, (uintptr_t)newp, memory_order_relaxed);
 -  if (pthread_setspecific (dbg->mem_key, newp) != 0)
 -    dbg->oom_handler ();
 +  pthread_rwlock_rdlock (&dbg->mem_rwl);
 +  newp->prev = dbg->mem_tails[thread_id];
 +  dbg->mem_tails[thread_id] = newp;
 +  pthread_rwlock_unlock (&dbg->mem_rwl);

    return (void *) result;
  }


OK. Since this is only called after __libdw_alloc_tail you know that
thread_id will be set.

Thanks,

Mark

Follow-Ups:
- Re: [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
  - From: Mark Wielaard

References:
- Re: [PATCH 0/2] libdw: Rewrite the memory handler to be more robust
  - From: Jonathon Anderson
- [PATCH 1/2] Add configure options for Valgrind annotations.
  - From: Mark Wielaard
- [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
  - From: Mark Wielaard
- Re: [PATCH 2/2] libdw: Rewrite the memory handler to be more robust.
  - From: Mark Wielaard

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]