Re: [PATCH v6] Also use l_tls_dtor_count to decide on object unload (BZ #18657)


On 07/20/2015 01:31 PM, Siddhesh Poyarekar wrote:
> v6: Juggled around the test case code a bit so that it uses tst-tls-atexit.c
> with a couple of macros.
> 
> When a TLS destructor is registered, we set the DF_1_NODELETE flag to
> signal that the object should not be destroyed.  We then clear the
> DF_1_NODELETE flag when all destructors are called, which is wrong -
> the flag could have been set by other means too.
> 
> This patch replaces this use of the flag by using l_tls_dtor_count
> directly to determine whether it is safe to unload the object.  This
> change has the added advantage of eliminating the lock taking when
> calling the destructors, which could result in a deadlock.  The patch
> also fixes the test case tst-tls-atexit - it was making an invalid
> dlclose call, which would just return an error silently.
> 
> I have also added a detailed note on concurrency, which aims to justify
> the semantics I chose for accesses to l_tls_dtor_count.  Thanks to
> Torvald for his help in getting me started on this and (literally)
> teaching me how to approach the problem.
> 
> Change verified on x86_64; the test suite does not show any
> regressions due to the patch.

This version looks great. Thanks for the awesome work!

OK for 2.22 with suggestions (see below).

I had lots of fun coming up with orderings between the release and acquire
pairs and the mutex locks, which I treated as release+acquire barriers
for the purpose of the analysis.

My own rough review notes for reference:

__cxa_thread_atexit_impl
        atomic_fetch_add_relaxed l_tls_dtor_count +1
        - dl_load_lock held.
                - Excludes any unloads, so forbids _dl_close_worker.
                - Does not forbid a thread being killed.
                        - Deferred cancel happens after lock release,
                          so it is not concurrent with anything else,
                          and the thread eventually calls __call_tls_dtors
                          with a release barrier.
        - dl_load_lock is a release+acquire barrier, which allows the
          relaxed-ordering atomic_fetch_add to l_tls_dtor_count, because
          after the unlock you will either see a consistent MAP or you
          won't see MAP at all, since loads/stores can't leak past the
          unlock.

__call_tls_dtors
        atomic_fetch_add_release curr->map->l_tls_dtor_count -1
        - write release
        - keeps the MAP dereference before the decrement, which means you
          never access MAP with l_tls_dtor_count set to 0.

_dl_close_worker
        Takes dl_load_lock
        atomic_load_acquire l_tls_dtor_count
        - load acquire
        - either sees l_tls_dtor_count as 0 or positive
                - if zero, then we know MAP in __call_tls_dtors was
                  already dereferenced, so we know it's safe to destroy
                  MAP
                - if positive, then we know MAP has not been dereferenced
                  yet, so we can't delete it.  We don't know when the
                  atomic_fetch_add_release will happen, if ever.
        if l_tls_dtor_count == 0
                free MAP.
        Releases dl_load_lock
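
To make the above concrete, here is a minimal standalone sketch of the
ordering discipline as I understand it (not glibc code; fake_map,
register_dtor, run_dtors and try_unload are made-up names, a plain
pthread mutex stands in for _dl_load_lock, and an atomic_size_t stands
in for l_tls_dtor_count):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct fake_map
{
  atomic_size_t tls_dtor_count;   /* Stands in for l_tls_dtor_count.  */
  void *payload;                  /* Stands in for the DSO's memory.  */
};

static pthread_mutex_t load_lock = PTHREAD_MUTEX_INITIALIZER;

/* __cxa_thread_atexit_impl: the increment can be relaxed because the
   surrounding lock/unlock orders it; the unlock publishes both the new
   map and the non-zero count to whoever takes the lock next.  */
static void
register_dtor (struct fake_map *map)
{
  pthread_mutex_lock (&load_lock);
  atomic_fetch_add_explicit (&map->tls_dtor_count, 1, memory_order_relaxed);
  pthread_mutex_unlock (&load_lock);
}

/* __call_tls_dtors: the release decrement keeps the preceding uses of
   MAP from being reordered past the point where try_unload may free it.
   The -1 addend mirrors the patch's atomic_fetch_add_release.  */
static void
run_dtors (struct fake_map *map)
{
  /* ... run the destructor, which dereferences MAP ...  */
  atomic_fetch_add_explicit (&map->tls_dtor_count, -1, memory_order_release);
}

/* _dl_close_worker: the acquire load pairs with the release decrement,
   so observing zero guarantees the destructor's MAP accesses are done.  */
static bool
try_unload (struct fake_map *map)
{
  bool freed = false;
  pthread_mutex_lock (&load_lock);
  if (atomic_load_explicit (&map->tls_dtor_count, memory_order_acquire) == 0)
    {
      free (map->payload);
      map->payload = NULL;
      freed = true;
    }
  pthread_mutex_unlock (&load_lock);
  return freed;
}

int
main (void)
{
  struct fake_map *map = malloc (sizeof *map);
  atomic_init (&map->tls_dtor_count, 0);
  map->payload = malloc (1);

  register_dtor (map);
  /* A pending destructor must block the unload ...  */
  if (try_unload (map))
    abort ();
  run_dtors (map);
  /* ... and once the count drops back to zero it must succeed.  */
  if (!try_unload (map))
    abort ();

  free (map);
  return 0;
}

The single-threaded main only exercises the protocol, of course; the
interesting cases are the cross-thread pairings described in the notes
above.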


> ChangeLog:
> 
> 	[BZ #18657]
> 	* elf/dl-close.c (_dl_close_worker): Don't unload DSO if there
> 	are pending TLS destructor calls.
> 	* include/link.h (struct link_map): Add concurrency note for
> 	L_TLS_DTOR_COUNT.
> 	* stdlib/cxa_thread_atexit_impl.c (__cxa_thread_atexit_impl):
> 	Don't touch the link map flag.  Atomically increment
> 	l_tls_dtor_count.
> 	(__call_tls_dtors): Atomically decrement l_tls_dtor_count.
> 	Avoid taking the load lock and don't touch the link map flag.
> 	* stdlib/tst-tls-atexit-nodelete.c: New test case.
> 	* stdlib/Makefile (tests): Use it.
> 	* stdlib/tst-tls-atexit.c (do_test): dlopen
> 	tst-tls-atexit-lib.so again before dlclose.  Add conditionals
> 	to allow tst-tls-atexit-nodelete test case to use it.
> ---
>  elf/dl-close.c                   |  9 ++++-
>  include/link.h                   |  4 ++-
>  stdlib/Makefile                  |  5 ++-
>  stdlib/cxa_thread_atexit_impl.c  | 76 ++++++++++++++++++++++++++++++----------
>  stdlib/tst-tls-atexit-nodelete.c | 24 +++++++++++++
>  stdlib/tst-tls-atexit.c          | 60 +++++++++++++++++++++++--------
>  6 files changed, 143 insertions(+), 35 deletions(-)
>  create mode 100644 stdlib/tst-tls-atexit-nodelete.c
> 
> diff --git a/elf/dl-close.c b/elf/dl-close.c
> index 2104674..9105277 100644
> --- a/elf/dl-close.c
> +++ b/elf/dl-close.c
> @@ -153,7 +153,11 @@ _dl_close_worker (struct link_map *map, bool force)
>        maps[idx] = l;
>        ++idx;
>  
> -      /* Clear DF_1_NODELETE to force object deletion.  */
> +      /* Clear DF_1_NODELETE to force object deletion.  We don't need to touch
> +	 l_tls_dtor_count because forced object deletion only happens when an
> +	 error occurs during object load.  Destructor registration for TLS
> +	 non-POD objects should not have happened till then for this
> +	 object.  */

OK.

>        if (force)
>  	l->l_flags_1 &= ~DF_1_NODELETE;
>      }
> @@ -177,6 +181,9 @@ _dl_close_worker (struct link_map *map, bool force)
>        if (l->l_type == lt_loaded
>  	  && l->l_direct_opencount == 0
>  	  && (l->l_flags_1 & DF_1_NODELETE) == 0
> +	  /* See CONCURRENCY NOTES in cxa_thread_atexit_impl.c to know why
> +	     acquire is sufficient and correct.  */
> +	  && atomic_load_acquire (&l->l_tls_dtor_count) == 0

OK.

>  	  && !used[done_index])
>  	continue;
>  
> diff --git a/include/link.h b/include/link.h
> index 423a695..7e7eb79 100644
> --- a/include/link.h
> +++ b/include/link.h
> @@ -302,7 +302,9 @@ struct link_map
>      /* Index of the module in the dtv array.  */
>      size_t l_tls_modid;
>  
> -    /* Number of thread_local objects constructed by this DSO.  */
> +    /* Number of thread_local objects constructed by this DSO.  This is
> +       atomically accessed and modified and is not always protected by the load
> +       lock.  See also: CONCURRENCY NOTES in cxa_thread_atexit_impl.c.  */

Suggest: s/load lock/_dl_load_lock/g so we don't get confused in the future
         if we split the big lock.

>      size_t l_tls_dtor_count;
>  
>      /* Information used to change permission after the relocations are
> diff --git a/stdlib/Makefile b/stdlib/Makefile
> index 7fc5a80..402466a 100644
> --- a/stdlib/Makefile
> +++ b/stdlib/Makefile
> @@ -74,7 +74,7 @@ tests		:= tst-strtol tst-strtod testmb testrand testsort testdiv   \
>  		   tst-makecontext3 bug-getcontext bug-fmtmsg1		    \
>  		   tst-secure-getenv tst-strtod-overflow tst-strtod-round   \
>  		   tst-tininess tst-strtod-underflow tst-tls-atexit	    \
> -		   tst-setcontext3
> +		   tst-setcontext3 tst-tls-atexit-nodelete
>  tests-static	:= tst-secure-getenv
>  
>  modules-names	= tst-tls-atexit-lib
> @@ -159,6 +159,9 @@ tst-tls-atexit-lib.so-no-z-defs = yes
>  $(objpfx)tst-tls-atexit: $(shared-thread-library) $(libdl)
>  $(objpfx)tst-tls-atexit.out: $(objpfx)tst-tls-atexit-lib.so
>  
> +$(objpfx)tst-tls-atexit-nodelete: $(shared-thread-library) $(libdl)
> +$(objpfx)tst-tls-atexit-nodelete.out: $(objpfx)tst-tls-atexit-lib.so
> +
>  $(objpfx)tst-setcontext3.out: tst-setcontext3.sh $(objpfx)tst-setcontext3
>  	$(SHELL) $< $(common-objpfx) '$(test-program-prefix-before-env)' \
>  		 '$(run-program-env)' '$(test-program-prefix-after-env)' \
> diff --git a/stdlib/cxa_thread_atexit_impl.c b/stdlib/cxa_thread_atexit_impl.c
> index 7a7d3d6..dc51ab5 100644
> --- a/stdlib/cxa_thread_atexit_impl.c
> +++ b/stdlib/cxa_thread_atexit_impl.c
> @@ -16,6 +16,50 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> +/* CONCURRENCY NOTES:
> +

Great writeup.

> +   The functions __cxa_thread_atexit_impl, _dl_close_worker and
> +   __call_tls_dtors are the three main routines that may run concurrently and
> +   access shared data.  The shared data in all possible combinations of all
> +   three functions are the link map list, a link map for a DSO and the link map
> +   member l_tls_dtor_count.
> +
> +   __cxa_thread_atexit_impl takes the load_lock before modifying any shared
> +   state and hence can concurrently execute multiple of its instances in
> +   different threads safely.
> +
> +   _dl_close_worker takes the load_lock before modifying any shared state as
> +   well and hence can concurrently execute multiple of its own instances as
> +   well as those of __cxa_thread_atexit_impl safely.
> +
> +   __call_tls_dtors does not take the load lock, but reads only the link map
> +   of the current DSO and its member l_tls_dtor_count.  It has to ensure that
> +   l_tls_dtor_count is decremented atomically
> +
> +   Correspondingly, _dl_close_worker loads l_tls_dtor_count and if it is zero,
> +   unloads the DSO, thus dereferencing the current link map.  Hence, for
> +   concurrent execution of _dl_close_worker and __call_tls_dtors, it is
> +   necessary that:
> +
> +   1. The DSO unload in _dl_close_worker happens after l_tls_dtor_count, i.e.

Suggest:

... happens after l_tls_dtor_count load, i.e.

> +      the l_tls_dtor_count load does not get reordered to after the DSO is
> +      unloaded
> +   2. The link map dereference in __call_tls_dtors happens before the
> +      l_tls_dtor_count dereference.

Agreed.

> +
> +   to ensure that we eliminate the inconsistent state where the DSO is unloaded
> +   before it is dereferenced in __call_tls_dtors, which could happen if any of
> +   the above two pairs of sequences were reordered.
> +
> +   To ensure this, the l_tls_dtor_count decrement in __call_tls_dtors should
> +   have release semantics and the load in _dl_close_worker should have acquire
> +   semantics.
> +
> +   Concurrent executions of __call_tls_dtors should only ensure that the value
> +   is modified atomically; no reordering constraints need to be considered.
> +   Likewise for the increment of l_tls_dtor_count in
> +   __cxa_thread_atexit_impl.  */


OK.

> +
>  #include <stdlib.h>
>  #include <ldsodefs.h>
>  
> @@ -49,9 +93,11 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
>    new->next = tls_dtor_list;
>    tls_dtor_list = new;
>  
> -  /* See if we already encountered the DSO.  */
> +  /* We have to take the big lock to prevent a racing dlclose from pulling our
> +     DSO from underneath us while we're setting up our destructor.  */
>    __rtld_lock_lock_recursive (GL(dl_load_lock));

OK.

>  
> +  /* See if we already encountered the DSO.  */
>    if (__glibc_unlikely (dso_symbol_cache != dso_symbol))
>      {
>        ElfW(Addr) caller = (ElfW(Addr)) dso_symbol;
> @@ -62,16 +108,14 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
>  	 program (we hope).  */
>        lm_cache = l ? l : GL(dl_ns)[LM_ID_BASE]._ns_loaded;
>      }
> -  /* A destructor could result in a thread_local construction and the former
> -     could have cleared the flag.  */
> -  if (lm_cache->l_type == lt_loaded && lm_cache->l_tls_dtor_count == 0)
> -    lm_cache->l_flags_1 |= DF_1_NODELETE;
> -
> -  new->map = lm_cache;
> -  new->map->l_tls_dtor_count++;
>  
> +  /* Relaxed since the only shared state here is l_tls_dtor_count and hence
> +     there are no reordering issues.  */
> +  atomic_fetch_add_relaxed (&lm_cache->l_tls_dtor_count, 1);

OK.

I assume that this is OK because the _dl_load_lock mutex acts as a
release/acquire barrier, which ensures that another thread would see a
consistent MAP if it itself takes _dl_load_lock.  That is to say, a
racing _dl_close_worker would either see no new MAP or a new MAP with a
non-zero l_tls_dtor_count value (assuming the DSO had a thread_local
destructor).

>    __rtld_lock_unlock_recursive (GL(dl_load_lock));
>  
> +  new->map = lm_cache;
> +

OK.

>    return 0;
>  }
>  
> @@ -83,19 +127,15 @@ __call_tls_dtors (void)
>    while (tls_dtor_list)
>      {
>        struct dtor_list *cur = tls_dtor_list;
> -      tls_dtor_list = tls_dtor_list->next;
>  
> +      tls_dtor_list = tls_dtor_list->next;
>        cur->func (cur->obj);
>  
> -      __rtld_lock_lock_recursive (GL(dl_load_lock));
> -
> -      /* Allow DSO unload if count drops to zero.  */
> -      cur->map->l_tls_dtor_count--;
> -      if (cur->map->l_tls_dtor_count == 0 && cur->map->l_type == lt_loaded)
> -        cur->map->l_flags_1 &= ~DF_1_NODELETE;
> -
> -      __rtld_lock_unlock_recursive (GL(dl_load_lock));
> -
> +      /* Ensure that the MAP dereference is not reordered below the
> +	 l_tls_dtor_count decrement.  This ensures that it synchronizes with
> +	 the load in _dl_close_worker and keeps this dereference before the DSO
> +	 unload.  See CONCURRENCY NOTES for more detail.  */
> +      atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);

OK.

>        free (cur);
>      }
>  }
> diff --git a/stdlib/tst-tls-atexit-nodelete.c b/stdlib/tst-tls-atexit-nodelete.c
> new file mode 100644
> index 0000000..ff290fa
> --- /dev/null
> +++ b/stdlib/tst-tls-atexit-nodelete.c
> @@ -0,0 +1,24 @@
> +/* Verify that a RTLD_NODELETE DSO is not unloaded even if its TLS objects are
> +   destroyed.
> +
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define NO_DELETE 1
> +#define H2_RTLD_FLAGS (RTLD_LAZY | RTLD_NODELETE)
> +#define LOADED_IS_GOOD true
> +#include "tst-tls-atexit.c"
> diff --git a/stdlib/tst-tls-atexit.c b/stdlib/tst-tls-atexit.c
> index cea655d..e9839d8 100644
> --- a/stdlib/tst-tls-atexit.c
> +++ b/stdlib/tst-tls-atexit.c
> @@ -16,12 +16,20 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -/* This test dynamically loads a DSO and spawns a thread that subsequently
> -   calls into the DSO to register a destructor for an object in the DSO and
> -   then calls dlclose on the handle for the DSO.  When the thread exits, the
> -   DSO should not be unloaded or else the destructor called during thread exit
> -   will crash.  Further in the main thread, the DSO is opened and closed again,
> -   at which point the DSO should be unloaded.  */
> +/* For the default case, i.e. NO_DELETE not defined, the test dynamically loads
> +   a DSO and spawns a thread that subsequently calls into the DSO to register a
> +   destructor for an object in the DSO and then calls dlclose on the handle for
> +   the DSO.  When the thread exits, the DSO should not be unloaded or else the
> +   destructor called during thread exit will crash.  Further in the main
> +   thread, the DSO is opened and closed again, at which point the DSO should be
> +   unloaded.
> +
> +   When NO_DELETE is defined, the DSO is loaded twice, once with just RTLD_LAZY
> +   flag and the second time with the RTLD_NODELETE flag set.  The thread is
> +   spawned, destructor registered and then thread exits without closing the
> +   DSO.  In the main thread, the first handle is then closed, followed by the
> +   second handle.  In the end, the DSO should remain loaded due to the
> +   RTLD_NODELETE flag being set in the second dlopen call.  */
>  
>  #include <dlfcn.h>
>  #include <pthread.h>
> @@ -31,6 +39,14 @@
>  #include <errno.h>
>  #include <link.h>
>  
> +#ifndef NO_DELETE
> +# define LOADED_IS_GOOD false
> +#endif
> +
> +#ifndef H2_RTLD_FLAGS
> +# define H2_RTLD_FLAGS (RTLD_LAZY)
> +#endif
> +
>  #define DSO_NAME "$ORIGIN/tst-tls-atexit-lib.so"
>  
>  /* Walk through the map in the _r_debug structure to see if our lib is still
> @@ -43,7 +59,10 @@ is_loaded (void)
>    for (; lm; lm = lm->l_next)
>      if (lm->l_type == lt_loaded && lm->l_name
>  	&& strcmp (basename (DSO_NAME), basename (lm->l_name)) == 0)
> -	  return true;
> +      {
> +	printf ("%s is still loaded\n", lm->l_name);
> +	return true;
> +      }
>    return false;
>  }
>  
> @@ -63,7 +82,9 @@ reg_dtor_and_close (void *h)
>  
>    reg_dtor ();
>  
> +#ifndef NO_DELETE
>    dlclose (h);
> +#endif
>  
>    return NULL;
>  }
> @@ -104,19 +125,30 @@ do_test (void)
>        return 1;
>      }
>  
> +#ifndef NO_DELETE
>    if (spawn_thread (h1) != 0)
>      return 1;
> +#endif
>  
> -  /* Now this should unload the DSO.  FIXME: This is a bug, calling dlclose
> -     like this is actually wrong, but it works because cxa_thread_atexit_impl
> -     has a bug which results in dlclose allowing this to work.  */
> -  dlclose (h1);
> +  void *h2 = dlopen (DSO_NAME, H2_RTLD_FLAGS);
> +  if (h2 == NULL)
> +    {
> +      printf ("h2: Unable to load DSO: %s\n", dlerror ());
> +      return 1;
> +    }
>  
> -  /* Check link maps to ensure that the DSO has unloaded.  */
> -  if (is_loaded ())
> +#ifdef NO_DELETE
> +  if (spawn_thread (h1) != 0)
>      return 1;
>  
> -  return 0;
> +  dlclose (h1);
> +#endif
> +  dlclose (h2);
> +
> +  /* Check link maps to ensure that the DSO has unloaded.  In the normal case,
> +     the DSO should be unloaded if there are no uses.  However, if one of the
> +     dlopen calls were with RTLD_NODELETE, the DSO should remain loaded.  */
> +  return is_loaded () == LOADED_IS_GOOD ? 0 : 1;
>  }
>  
>  #define TEST_FUNCTION do_test ()
> 

OK.

Cheers,
Carlos.

