[PATCH] Use IE model for static variables in glibc

The recently introduced TLS variables in the thread-local destructor
implementation (__cxa_thread_atexit_impl) used the default GD access
model, resulting in a call to __tls_get_addr.  This causes a deadlock
with recent changes to the way TLS is initialized because DTV
allocations are delayed and hence despite knowing the offset to the
variable inside its TLS block, the thread has to take the global rtld
lock to safely update the TLS offset.

This causes deadlocks when a thread is instantiated and joined inside
a destructor of a dlopen'd DSO.  The correct long-term fix is to
somehow not take the lock, but that will need a much deeper change set
to alter the way in which the big rtld lock is used.

Instead, this patch just eliminates the call to __tls_get_addr for the
thread-local variables inside glibc by marking them as IE (initial-exec).
The variables changed are the 3 in cxa_thread_atexit_impl and the
strerror_l thread-local variable.

There were concerns that the static storage for TLS is limited and
hence we should not be using it.  Additionally, dynamically loaded
modules may result in libc looking for this static storage pretty
late in static binaries.  Both concerns are valid when using TLSDESC
since that is where one may attempt to allocate a TLS block from
static storage for even those variables that are not IE.  They're not
very strong arguments for the traditional TLS model though, since it
assumes that the static storage would be used sparingly and definitely
not by default.  Hence, for now this would only theoretically affect
ARM architectures.

The impact is hence limited to statically linked binaries that dlopen
modules that in turn load libc, and only on ARM hardware.  It seems
like a small enough impact to justify fixing the larger problem that
currently affects everything everywhere.

This still does not solve the original problem completely.  That is,
it is still possible to deadlock on the big rtld lock with a small
tweak to the test case attached to this patch.  That problem is
however not a regression in 2.22 and hence could be tackled as a
separate project.  The test case is picked up as is from Alex's patch.

This change has been tested to verify that it does not cause any
issues on x86_64.  I don't have ARM hardware handy to verify that
everything works OK, but I don't think there are tests to stress the
static TLS block, so it shouldn't make a difference.


	[BZ #18457]
	* nptl/Makefile (tests): New test case tst-join7.
	(modules-names): New test case module tst-join7mod.
	* nptl/tst-join7.c: New file.
	* nptl/tst-join7mod.c: New file.
	* stdlib/cxa_thread_atexit_impl.c (tls_dtor_list,
	dso_symbol_cache, lm_cache): Mark variables as IE.
	* string/strerror_l.c (last_value): Likewise.
 nptl/Makefile                   | 10 ++++++++--
 nptl/tst-join7.c                | 12 ++++++++++++
 nptl/tst-join7mod.c             | 30 ++++++++++++++++++++++++++++++
 stdlib/cxa_thread_atexit_impl.c |  6 +++---
 string/strerror_l.c             |  2 +-
 5 files changed, 54 insertions(+), 6 deletions(-)
 create mode 100644 nptl/tst-join7.c
 create mode 100644 nptl/tst-join7mod.c

diff --git a/nptl/Makefile b/nptl/Makefile
index 4544aa2..f14f4d6 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -245,7 +245,7 @@ tests = tst-typesizes \
 	tst-basic7 \
 	tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \
 	tst-raise1 \
-	tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 \
+	tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 tst-join7 \
 	tst-detach1 \
 	tst-eintr1 tst-eintr2 tst-eintr3 tst-eintr4 tst-eintr5 \
 	tst-tsd1 tst-tsd2 tst-tsd3 tst-tsd4 tst-tsd5 tst-tsd6 \
@@ -323,7 +323,8 @@ endif
 modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \
 		tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \
 		tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \
-		tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod
+		tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \
+		tst-join7mod
 extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o
 test-extras += $(modules-names) tst-cleanup4aux
 test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names)))
@@ -528,6 +529,11 @@ $(objpfx)tst-tls6.out: $(objpfx)tst-tls5 \
+$(objpfx)tst-join7: $(libdl) $(shared-thread-library)
+$(objpfx)tst-join7.out: $(objpfx)
+$(objpfx) $(shared-thread-library) = -Wl,-soname,
 $(objpfx)tst-dlsym1: $(libdl) $(shared-thread-library)
 $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)
diff --git a/nptl/tst-join7.c b/nptl/tst-join7.c
new file mode 100644
index 0000000..bf6fc76
--- /dev/null
+++ b/nptl/tst-join7.c
@@ -0,0 +1,12 @@
+#include <dlfcn.h>
+do_test (void)
+  void *f = dlopen ("", RTLD_NOW | RTLD_GLOBAL);
+  if (f) dlclose (f); else return 1;
+  return 0;
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/nptl/tst-join7mod.c b/nptl/tst-join7mod.c
new file mode 100644
index 0000000..9960b76
--- /dev/null
+++ b/nptl/tst-join7mod.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <pthread.h>
+#include <atomic.h>
+static pthread_t th;
+static int running = 1;
+static void *
+test_run (void *p)
+  while (atomic_load_relaxed (&running))
+    printf ("XXX test_run\n");
+  printf ("XXX test_run FINISHED\n");
+  return NULL;
+static void __attribute__ ((constructor))
+do_init (void)
+  pthread_create (&th, NULL, test_run, NULL);
+static void __attribute__ ((destructor))
+do_end (void)
+  atomic_store_relaxed (&running, 0);
+  printf ("thread_join...\n");
+  pthread_join (th, NULL);
+  printf ("thread_join DONE\n");
diff --git a/stdlib/cxa_thread_atexit_impl.c b/stdlib/cxa_thread_atexit_impl.c
index 54e2812..9120162 100644
--- a/stdlib/cxa_thread_atexit_impl.c
+++ b/stdlib/cxa_thread_atexit_impl.c
@@ -29,9 +29,9 @@ struct dtor_list
   struct dtor_list *next;
-static __thread struct dtor_list *tls_dtor_list;
-static __thread void *dso_symbol_cache;
-static __thread struct link_map *lm_cache;
+static __thread struct dtor_list *tls_dtor_list attribute_tls_model_ie;
+static __thread void *dso_symbol_cache attribute_tls_model_ie;
+static __thread struct link_map *lm_cache attribute_tls_model_ie;
 /* Register a destructor for TLS variables declared with the 'thread_local'
    keyword.  This function is only called from code generated by the C++
diff --git a/string/strerror_l.c b/string/strerror_l.c
index 2ed78b5..0b8bf2a 100644
--- a/string/strerror_l.c
+++ b/string/strerror_l.c
@@ -23,7 +23,7 @@
 #include <sys/param.h>
-static __thread char *last_value;
+static __thread char *last_value attribute_tls_model_ie;
 static const char *

