From c1c340aa1c653014394dbe280a85515c9a7603f4 Mon Sep 17 00:00:00 2001 From: Philipp Trommler Date: Fri, 8 Apr 2016 10:48:41 +0200 Subject: [PATCH] Fix TLS performance degradation after dlopen --- elf/dl-close.c | 2 ++ elf/dl-open.c | 10 ++++++++-- elf/dl-tls.c | 35 ++++++++++++++++++++++------------- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/elf/dl-close.c b/elf/dl-close.c index 687d7de..f1d008f 100644 --- a/elf/dl-close.c +++ b/elf/dl-close.c @@ -32,6 +32,7 @@ #include #include #include +#include #include @@ -753,6 +754,7 @@ _dl_close_worker (struct link_map *map, bool force) /* If we removed any object which uses TLS bump the generation counter. */ if (any_tls) { + atomic_write_barrier (); if (__glibc_unlikely (++GL(dl_tls_generation) == 0)) _dl_fatal_printf ("TLS generation counter wrapped! Please report as described in "REPORT_BUGS_TO".\n"); diff --git a/elf/dl-open.c b/elf/dl-open.c index 6f178b3..e696c4e 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -524,9 +524,15 @@ dl_open_worker (void *a) } /* Bump the generation number if necessary. */ - if (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0)) - _dl_fatal_printf (N_("\ + if (any_tls) + { + atomic_write_barrier (); + if (__builtin_expect (++GL(dl_tls_generation) == 0, 0)) + { + _dl_fatal_printf (N_("\ TLS generation counter wrapped! Please report this.")); + } + } /* We need a second pass for static tls data, because _dl_update_slotinfo must not be run while calls to _dl_add_to_slotinfo are still pending. */ diff --git a/elf/dl-tls.c b/elf/dl-tls.c index ed13fd9..7c2c5c7 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -629,12 +629,16 @@ _dl_update_slotinfo (unsigned long int req_modid) possible that other threads at the same time dynamically load code and therefore add to the slotinfo list. This is a problem since we must not pick up any information about incomplete work. - The solution to this is to ignore all dtv slots which were - created after the one we are currently interested. We know that - dynamic loading for this module is completed and this is the last - load operation we know finished. */ + The solution to this is to ensure that we capture the current + generation count before walking the list, and ignore any slots + which are marked with a higher generation count, since they may + still be in the process of being updated. */ unsigned long int idx = req_modid; struct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list); + size_t new_gen = GL(dl_tls_generation); + + /* Make sure we don't read _dl_tls_generation too late. */ + atomic_read_barrier(); while (idx >= listp->len) { @@ -642,13 +646,8 @@ _dl_update_slotinfo (unsigned long int req_modid) listp = listp->next; } - if (dtv[0].counter < listp->slotinfo[idx].gen) { - /* The generation counter for the slot is higher than what the - current dtv implements. We have to update the whole dtv but - only those entries with a generation counter <= the one for - the entry we need. */ - size_t new_gen = listp->slotinfo[idx].gen; + /* Tilera: leave scope to make diffs against our baseline easier. */ size_t total = 0; /* We have to look through the entire dtv slotinfo list. */ @@ -916,7 +915,7 @@ _dl_add_to_slotinfo (struct link_map *l) the first slot. */ assert (idx == 0); - listp = prevp->next = (struct dtv_slotinfo_list *) + listp = (struct dtv_slotinfo_list *) malloc (sizeof (struct dtv_slotinfo_list) + TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); if (listp == NULL) @@ -939,9 +938,19 @@ cannot create TLS data structures")); listp->next = NULL; memset (listp->slotinfo, '\0', TLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo)); + + /* Make sure the new slotinfo element is in memory before exposing it. */ + atomic_write_barrier(); + prevp->next = listp; } - /* Add the information into the slotinfo data structure. */ - listp->slotinfo[idx].map = l; + /* Add the information into the slotinfo data structure. Make sure + to barrier as we do it, to present a consistent view to other + threads that may be calling __tls_get_addr() as we do this. + Barrier after the writes so we know everything is ready for an + increment of _dl_tls_generation. */ listp->slotinfo[idx].gen = GL(dl_tls_generation) + 1; + atomic_write_barrier(); + listp->slotinfo[idx].map = l; + atomic_write_barrier(); } -- 2.5.0