Re: [PATCH] [BZ 18034] [AArch64] Lazy TLSDESC relocation data race fix
- From: Szabolcs Nagy <szabolcs.nagy@arm.com>
- To: Torvald Riegel <triegel@redhat.com>
- Cc: "libc-alpha@sourceware.org" <libc-alpha@sourceware.org>, Marcus Shawcroft <Marcus.Shawcroft@arm.com>, Ramana Radhakrishnan <Ramana.Radhakrishnan@arm.com>
- Date: Mon, 01 Jun 2015 11:25:05 +0100
- Subject: Re: [PATCH] [BZ 18034] [AArch64] Lazy TLSDESC relocation data race fix
- References: <553793A3.7030206@arm.com> <1429718899.6557.17.camel@triegel.csb> <553E5381.504@arm.com> <1432672677.26239.41.camel@triegel.csb> <5565AA2A.7010509@arm.com> <1432731762.30849.53.camel@triegel.csb>
On 27/05/15 14:02, Torvald Riegel wrote:
> On Wed, 2015-05-27 at 12:27 +0100, Szabolcs Nagy wrote:
>> On 26/05/15 21:37, Torvald Riegel wrote:
>>> This should have relaxed atomic accesses for all things concurrently
>>> accessed.  As a result, you can drop the volatile qualification.
I removed volatile from aarch64 tlsdesc.c, but kept it in
elf/tlsdeschtab.h for now.
>>> You should also document why the relaxed MO load in
>>> _dl_tlsdesc_resolve_early_return_p is sufficient (see the other part of
I added a comment to the _dl_tlsdesc_resolve_early_return_p
call in aarch64 tlsdesc.c about the retry loop.
Attached is the updated patch:
- The C code now follows the glibc concurrency guidelines
(using relaxed atomics instead of volatile, and a release store
instead of a barrier on the write side; see the snippet below).
- Fixed the atomics in elf/tlsdeschtab.h too.
- Updated the comments.
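For example, in the case that resolves to _dl_tlsdesc_return_lazy,
the write side of the patch now ends with:

  atomic_store_relaxed (&td->arg, p);
  /* This release store synchronizes with the ldar acquire load
     instruction in _dl_tlsdesc_return_lazy.  */
  atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);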
The write side now compiles to the following (stlr has store-release semantics):
158: 91002261 add x1, x19, #0x8
15c: f9000020 str x0, [x1]
160: 90000000 adrp x0, 0 <_dl_tlsdesc_return_lazy>
160: R_AARCH64_ADR_PREL_PG_HI21 _dl_tlsdesc_return_lazy
164: 91000000 add x0, x0, #0x0
164: R_AARCH64_ADD_ABS_LO12_NC _dl_tlsdesc_return_lazy
168: c89ffe60 stlr x0, [x19]
The read side is (ldar has load-acquire semantics):
0000000000000008 <_dl_tlsdesc_return_lazy>:
8: c8dffc1f ldar xzr, [x0]
c: f9400400 ldr x0, [x0,#8]
10: d65f03c0 ret
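For illustration only, here is a standalone C11 sketch of the
message-passing idiom this implements (not glibc code: td is reduced
to its two fields and the types are simplified):

  #include <stdatomic.h>

  struct tlsdesc_model
  {
    _Atomic (void *) entry;   /* stands for td->entry  */
    _Atomic (long) arg;       /* stands for td->arg  */
  };

  /* Writer (the resolver): initialize arg, then publish it by
     storing the final entry function with release semantics.  */
  void
  publish (struct tlsdesc_model *td, void *fn, long a)
  {
    atomic_store_explicit (&td->arg, a, memory_order_relaxed);
    atomic_store_explicit (&td->entry, fn, memory_order_release);
  }

  /* Reader (a TLS access): the acquire load of entry corresponds
     to the "ldar xzr, [x0]" above; the loaded value is discarded,
     only the ordering with the subsequent arg load matters.  */
  long
  consume (struct tlsdesc_model *td)
  {
    (void) atomic_load_explicit (&td->entry, memory_order_acquire);
    return atomic_load_explicit (&td->arg, memory_order_relaxed);
  }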
ChangeLog:
2015-06-01 Szabolcs Nagy <szabolcs.nagy@arm.com>
[BZ #18034]
* sysdeps/aarch64/dl-tlsdesc.h (_dl_tlsdesc_return_lazy): Declare.
* sysdeps/aarch64/dl-tlsdesc.S (_dl_tlsdesc_return_lazy): Define.
(_dl_tlsdesc_undefweak): Guarantee TLSDESC entry and argument load-load
ordering using ldar.
(_dl_tlsdesc_dynamic): Likewise.
(_dl_tlsdesc_return_lazy): Likewise.
* sysdeps/aarch64/tlsdesc.c (_dl_tlsdesc_resolve_rela_fixup): Use
relaxed atomics instead of volatile and synchronize with release store.
(_dl_tlsdesc_resolve_hold_fixup): Use relaxed atomics instead of
volatile.
* elf/tlsdeschtab.h (_dl_tlsdesc_resolve_early_return_p): Likewise.
diff --git a/elf/tlsdeschtab.h b/elf/tlsdeschtab.h
index d13b4e5..fb0eb88 100644
--- a/elf/tlsdeschtab.h
+++ b/elf/tlsdeschtab.h
@@ -20,6 +20,8 @@
#ifndef TLSDESCHTAB_H
# define TLSDESCHTAB_H 1
+#include <atomic.h>
+
# ifdef SHARED
# include <inline-hashtab.h>
@@ -138,17 +140,17 @@ _dl_make_tlsdesc_dynamic (struct link_map *map, size_t ti_offset)
static int
_dl_tlsdesc_resolve_early_return_p (struct tlsdesc volatile *td, void *caller)
{
- if (caller != td->entry)
+ if (caller != atomic_load_relaxed (&td->entry))
return 1;
__rtld_lock_lock_recursive (GL(dl_load_lock));
- if (caller != td->entry)
+ if (caller != atomic_load_relaxed (&td->entry))
{
__rtld_lock_unlock_recursive (GL(dl_load_lock));
return 1;
}
- td->entry = _dl_tlsdesc_resolve_hold;
+ atomic_store_relaxed (&td->entry, _dl_tlsdesc_resolve_hold);
return 0;
}
diff --git a/sysdeps/aarch64/dl-tlsdesc.S b/sysdeps/aarch64/dl-tlsdesc.S
index be9b9b3..c7adf79 100644
--- a/sysdeps/aarch64/dl-tlsdesc.S
+++ b/sysdeps/aarch64/dl-tlsdesc.S
@@ -79,6 +79,29 @@ _dl_tlsdesc_return:
cfi_endproc
.size _dl_tlsdesc_return, .-_dl_tlsdesc_return
+ /* Same as _dl_tlsdesc_return but with synchronization for
+ lazy relocation.
+ Prototype:
+ _dl_tlsdesc_return_lazy (tlsdesc *) ;
+ */
+ .hidden _dl_tlsdesc_return_lazy
+ .global _dl_tlsdesc_return_lazy
+ .type _dl_tlsdesc_return_lazy,%function
+ cfi_startproc
+ .align 2
+_dl_tlsdesc_return_lazy:
+ /* The ldar here happens after the load from [x0] at the call site
+ (that is generated by the compiler as part of the TLS access ABI),
+ so it reads the same value (this function is the final value of
+ td->entry) and thus it synchronizes with the release store to
+ td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
+ from [x0,#8] here happens after the initialization of td->arg. */
+ ldar xzr, [x0]
+ ldr x0, [x0, #8]
+ RET
+ cfi_endproc
+ .size _dl_tlsdesc_return_lazy, .-_dl_tlsdesc_return_lazy
+
/* Handler for undefined weak TLS symbols.
Prototype:
_dl_tlsdesc_undefweak (tlsdesc *);
@@ -96,6 +119,13 @@ _dl_tlsdesc_return:
_dl_tlsdesc_undefweak:
str x1, [sp, #-16]!
cfi_adjust_cfa_offset(16)
+ /* The ldar here happens after the load from [x0] at the call site
+ (that is generated by the compiler as part of the TLS access ABI),
+ so it reads the same value (this function is the final value of
+ td->entry) and thus it synchronizes with the release store to
+ td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
+ from [x0,#8] here happens after the initialization of td->arg. */
+ ldar xzr, [x0]
ldr x0, [x0, #8]
mrs x1, tpidr_el0
sub x0, x0, x1
@@ -152,6 +182,13 @@ _dl_tlsdesc_dynamic:
stp x3, x4, [sp, #32+16*1]
mrs x4, tpidr_el0
+ /* The ldar here happens after the load from [x0] at the call site
+ (that is generated by the compiler as part of the TLS access ABI),
+ so it reads the same value (this function is the final value of
+ td->entry) and thus it synchronizes with the release store to
+ td->entry in _dl_tlsdesc_resolve_rela_fixup ensuring that the load
+ from [x0,#8] here happens after the initialization of td->arg. */
+ ldar xzr, [x0]
ldr x1, [x0,#8]
ldr x0, [x4]
ldr x3, [x1,#16]
diff --git a/sysdeps/aarch64/dl-tlsdesc.h b/sysdeps/aarch64/dl-tlsdesc.h
index 7a1285e..e6c0078 100644
--- a/sysdeps/aarch64/dl-tlsdesc.h
+++ b/sysdeps/aarch64/dl-tlsdesc.h
@@ -46,6 +46,9 @@ extern ptrdiff_t attribute_hidden
_dl_tlsdesc_return (struct tlsdesc *);
extern ptrdiff_t attribute_hidden
+_dl_tlsdesc_return_lazy (struct tlsdesc *);
+
+extern ptrdiff_t attribute_hidden
_dl_tlsdesc_undefweak (struct tlsdesc *);
extern ptrdiff_t attribute_hidden
diff --git a/sysdeps/aarch64/tlsdesc.c b/sysdeps/aarch64/tlsdesc.c
index 4821f8c..d2f1cd5 100644
--- a/sysdeps/aarch64/tlsdesc.c
+++ b/sysdeps/aarch64/tlsdesc.c
@@ -25,6 +25,7 @@
#include <dl-tlsdesc.h>
#include <dl-unmap-segments.h>
#include <tlsdeschtab.h>
+#include <atomic.h>
/* The following functions take an entry_check_offset argument. It's
computed by the caller as an offset between its entry point and the
@@ -39,11 +40,12 @@
void
attribute_hidden
-_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
- struct link_map *l)
+_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc *td, struct link_map *l)
{
- const ElfW(Rela) *reloc = td->arg;
+ const ElfW(Rela) *reloc = atomic_load_relaxed (&td->arg);
+ /* Return and thus retry calling td->entry if it changes before
+ GL(dl_load_lock) is grabbed. */
if (_dl_tlsdesc_resolve_early_return_p
(td, (void*)(D_PTR (l, l_info[ADDRIDX (DT_TLSDESC_PLT)]) + l->l_addr)))
return;
@@ -86,8 +88,10 @@ _dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
if (!sym)
{
- td->arg = (void*) reloc->r_addend;
- td->entry = _dl_tlsdesc_undefweak;
+ atomic_store_relaxed (&td->arg, (void *) reloc->r_addend);
+ /* This release store synchronizes with the ldar acquire load
+ instruction in _dl_tlsdesc_undefweak. */
+ atomic_store_release (&td->entry, _dl_tlsdesc_undefweak);
}
else
{
@@ -96,16 +100,22 @@ _dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
# else
if (!TRY_STATIC_TLS (l, result))
{
- td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value
+ void *p = _dl_make_tlsdesc_dynamic (result, sym->st_value
+ reloc->r_addend);
- td->entry = _dl_tlsdesc_dynamic;
+ atomic_store_relaxed (&td->arg, p);
+ /* This release store synchronizes with the ldar acquire load
+ instruction in _dl_tlsdesc_dynamic. */
+ atomic_store_release (&td->entry, _dl_tlsdesc_dynamic);
}
else
# endif
{
- td->arg = (void*) (sym->st_value + result->l_tls_offset
+ void *p = (void*) (sym->st_value + result->l_tls_offset
+ reloc->r_addend);
- td->entry = _dl_tlsdesc_return;
+ atomic_store_relaxed (&td->arg, p);
+ /* This release store synchronizes with the ldar acquire load
+ instruction in _dl_tlsdesc_return_lazy. */
+ atomic_store_release (&td->entry, _dl_tlsdesc_return_lazy);
}
}
@@ -120,11 +130,10 @@ _dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td,
void
attribute_hidden
-_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td,
- void *caller)
+_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc *td, void *caller)
{
/* Maybe we're lucky and can return early. */
- if (caller != td->entry)
+ if (caller != atomic_load_relaxed (&td->entry))
return;
/* Locking here will stop execution until the running resolver runs