This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH][BZ 18572][ARM] Fix lazy TLSDESC race on arm
- From: Szabolcs Nagy <szabolcs dot nagy at arm dot com>
- To: GNU C Library <libc-alpha at sourceware dot org>
- Cc: Marcus Shawcroft <Marcus dot Shawcroft at arm dot com>, Ramana Radhakrishnan <Ramana dot Radhakrishnan at arm dot com>
- Date: Thu, 12 Nov 2015 11:31:46 +0000
- Subject: [PATCH][BZ 18572][ARM] Fix lazy TLSDESC race on arm
- Authentication-results: sourceware.org; auth=none
Lazily initialized TLS descriptors have a race on ARM, so
concurrent TLS access can cause crashes with -mtls-dialect=gnu2.
See the fix and description for AArch64:
https://sourceware.org/ml/libc-alpha/2015-06/msg00496.html
Differences compared to AArch64:
- Various versions of ARM have different synchronization methods, so a
DMB() macro is defined that selects the right memory barrier at
compile time (compiler builtins are not available in asm and glibc
does not appear to do runtime dispatch).
- tls descriptor is (arg,f) on arm instead of (f,arg), this makes
the "load f again with acquire mo" solution of AArch64 less
attractive. (LDA on AArch32 only works with a 0 offset, and with an
extra ADD the ARMv8 case would need more special-casing, with
questionable benefit.)
- the argument is different (symbol table index instead of a
pointer to the relocation entry) so symbol lookup is different
and there is no simple way to have common tlsdesc.c across
different archs.
- _dl_tlsdesc_undefweak does not access the argument so it needs
no synchronization.
The C code uses atomic_store_release for lazy initialization and
the asm uses DMB in the resolvers: every TLS access will go
through an extra barrier (which is expected to have some performance
impact).
Tested on armv7-a.
2015-11-12 Szabolcs Nagy <szabolcs.nagy@arm.com>
[BZ #18572]
* sysdeps/arm/sysdep.h (DMB): Define.
* sysdeps/arm/dl-tlsdesc.h (_dl_tlsdesc_return_lazy): Declare.
* sysdeps/arm/dl-tlsdesc.S (_dl_tlsdesc_return_lazy): Define.
(_dl_tlsdesc_dynamic): Guarantee TLSDESC entry and argument load-load
ordering using DMB.
* sysdeps/arm/tlsdesc.c (_dl_tlsdesc_lazy_resolver_fixup): Use relaxed
atomics instead of volatile and synchronize using release store.
(_dl_tlsdesc_resolve_hold_fixup): Use relaxed atomics instead of
volatile.
diff --git a/sysdeps/arm/dl-tlsdesc.S b/sysdeps/arm/dl-tlsdesc.S
index 33a2695..1d28cae 100644
--- a/sysdeps/arm/dl-tlsdesc.S
+++ b/sysdeps/arm/dl-tlsdesc.S
@@ -39,6 +39,25 @@ _dl_tlsdesc_return:
cfi_endproc
.size _dl_tlsdesc_return, .-_dl_tlsdesc_return
+ .hidden _dl_tlsdesc_return_lazy
+ .global _dl_tlsdesc_return_lazy
+ .type _dl_tlsdesc_return_lazy,#function
+ cfi_startproc
+ eabi_fnstart
+ .align 2
+_dl_tlsdesc_return_lazy:
+ /* This DMB synchronizes the load of td->entry ([r0,#4])
+ at the call site with the release store to it in
+ _dl_tlsdesc_lazy_resolver_fixup so the load from [r0]
+ here happens after the initialization of td->argument. */
+ DMB ()
+ sfi_breg r0, \
+ ldr r0, [\B]
+ BX (lr)
+ eabi_fnend
+ cfi_endproc
+ .size _dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
+
.hidden _dl_tlsdesc_undefweak
.global _dl_tlsdesc_undefweak
.type _dl_tlsdesc_undefweak,#function
@@ -92,6 +111,11 @@ _dl_tlsdesc_dynamic:
cfi_rel_offset (r3,4)
cfi_rel_offset (r4,8)
cfi_rel_offset (lr,12)
+ /* This DMB synchronizes the load of td->entry ([r0,#4])
+ at the call site with the release store to it in
+ _dl_tlsdesc_lazy_resolver_fixup so the load from [r0]
+ here happens after the initialization of td->argument. */
+ DMB ()
sfi_breg r0, \
ldr r1, [\B] /* td */
GET_TLS (lr)
diff --git a/sysdeps/arm/dl-tlsdesc.h b/sysdeps/arm/dl-tlsdesc.h
index e17b0bd..26fdb62 100644
--- a/sysdeps/arm/dl-tlsdesc.h
+++ b/sysdeps/arm/dl-tlsdesc.h
@@ -48,6 +48,7 @@ struct tlsdesc_dynamic_arg
extern ptrdiff_t attribute_hidden
_dl_tlsdesc_return(struct tlsdesc *),
+ _dl_tlsdesc_return_lazy(struct tlsdesc *),
_dl_tlsdesc_undefweak(struct tlsdesc *),
_dl_tlsdesc_resolve_hold(struct tlsdesc *),
_dl_tlsdesc_lazy_resolver(struct tlsdesc *);
diff --git a/sysdeps/arm/sysdep.h b/sysdeps/arm/sysdep.h
index 9bbd009..c767c56 100644
--- a/sysdeps/arm/sysdep.h
+++ b/sysdeps/arm/sysdep.h
@@ -277,6 +277,15 @@
cfi_restore_state
# endif /* ARCH_HAS_HARD_TP */
+/* Memory barrier for TLSDESC resolvers. */
+#if __ARM_ARCH >= 7
+# define DMB() dmb ish
+#elif __ARM_ARCH >= 6
+# define DMB() mcr p15, 0, r0, c7, c10, 5
+#else
+# define DMB() bl __sync_synchronize
+#endif
+
# ifndef ARM_SFI_MACROS
# define ARM_SFI_MACROS 1
/* This assembly macro is prepended to any load/store instruction,
diff --git a/sysdeps/arm/tlsdesc.c b/sysdeps/arm/tlsdesc.c
index ca941b8..27101e6 100644
--- a/sysdeps/arm/tlsdesc.c
+++ b/sysdeps/arm/tlsdesc.c
@@ -23,6 +23,7 @@
#include <dl-tlsdesc.h>
#include <dl-unmap-segments.h>
#include <tlsdeschtab.h>
+#include <atomic.h>
/* This function is used to lazily resolve TLS_DESC REL relocations
Besides the TLS descriptor itself, we get the module's got address
@@ -30,22 +31,26 @@
void
attribute_hidden
-_dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
- Elf32_Addr *got)
+_dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc *td, Elf32_Addr *got)
{
struct link_map *l = (struct link_map *)got[1];
lookup_t result;
- unsigned long value;
+ unsigned long value = atomic_load_relaxed (&td->argument.value);
+ /* After GL(dl_load_lock) is grabbed only one caller can see td->entry in
+ initial state in _dl_tlsdesc_resolve_early_return_p, other concurrent
+ callers will return and retry calling td->entry. The updated td->entry
+ synchronizes with the single writer so all read accesses here can use
+ relaxed order. */
if (_dl_tlsdesc_resolve_early_return_p
(td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr)))
return;
- if (td->argument.value & 0x80000000)
+ if (value & 0x80000000)
{
/* A global symbol, this is the symbol index. */
/* The code below was borrowed from _dl_fixup(). */
- const Elf_Symndx symndx = td->argument.value ^ 0x80000000;
+ const Elf_Symndx symndx = value ^ 0x80000000;
const ElfW(Sym) *const symtab
= (const void *) D_PTR (l, l_info[DT_SYMTAB]);
const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]);
@@ -76,7 +81,9 @@ _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
value = sym->st_value;
else
{
- td->entry = _dl_tlsdesc_undefweak;
+ /* _dl_tlsdesc_undefweak does not access td->argument
+ so this store can be relaxed. */
+ atomic_store_relaxed (&td->entry, _dl_tlsdesc_undefweak);
goto done;
}
}
@@ -92,7 +99,6 @@ _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
{
/* A local symbol, this is the offset within our tls section.
*/
- value = td->argument.value;
result = l;
}
@@ -101,14 +107,19 @@ _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
#else
if (!TRY_STATIC_TLS (l, result))
{
- td->argument.pointer = _dl_make_tlsdesc_dynamic (result, value);
- td->entry = _dl_tlsdesc_dynamic;
+ void *p = _dl_make_tlsdesc_dynamic (result, value);
+ atomic_store_relaxed (&td->argument.pointer, p);
+ /* This release store synchronizes with the DMB in
+ _dl_tlsdesc_dynamic (relying on the arm memory model). */
+ atomic_store_release (&td->entry, _dl_tlsdesc_dynamic);
}
else
#endif
{
- td->argument.value = value + result->l_tls_offset;
- td->entry = _dl_tlsdesc_return;
+ atomic_store_relaxed (&td->argument.value, value + result->l_tls_offset);
+ /* This release store synchronizes with the DMB in
+ _dl_tlsdesc_return_lazy (relying on the arm memory model). */
+ atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);
}
done:
@@ -123,11 +134,10 @@ _dl_tlsdesc_lazy_resolver_fixup (struct tlsdesc volatile *td,
void
attribute_hidden
-_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
- void *caller)
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller)
{
/* Maybe we're lucky and can return early. */
- if (caller != td->entry)
+ if (caller != atomic_load_relaxed (&td->entry))
return;
/* Locking here will stop execution until the running resolver runs