This is the mail archive of the mailing list for the glibc project.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Fixes TLS performance degradation after dlopen

Fixes a performance degradation of TLS access in shared libraries which
can be observed after another shared library that uses TLS is loaded
via dl_open. In this case __tls_get_addr takes significant more time.
Once that shared library accesses it's TLS, the performance normalizes.

Attached is a minimal example which gives the following output:

time: 0.723744
after dlopen,
time: 1.789016
after tls access in loaded lib,
time: 0.672865

We do have a use-case where this is actually really significant. I
believe this happens for instance if libstdc++ is loaded implicitly,
but TLS features are not actively used.

I strongly suspect this is the same issue as discussed in this post on
the ÂClibc mailing list:

and therefore the patch provided mainly reuses the solution they've
found for ÂClibc.

Main development platform information (tested on multiple platforms):

* latest glibc git version (pulled at Tue JunÂÂ7 15:45:13 CEST 2016)
* x86_64 (but should be independent)
*ÂLinux TP_Trommler 4.5.4-1-ARCH #1 SMP PREEMPT Wed May 11 22:21:28
CEST 2016 x86_64 GNU/Linux
*Âgcc (GCC) 6.1.1 20160501
*ÂGNU ld (GNU Binutils)

The patch provided touches code already altered (but not released) by
the work on bug 19329.

2016-06-07 ÂPhilipp Trommler Â<>

	[BZ #19924]
	* elf/dl-close.c: Port changes from the ÂClibc to fix
	performance degradation after dl_open
	* elf/dl-open.c: Likewise
	* elf/dl-tls.c: Likewise

Âelf/dl-close.c |ÂÂ2 ++
Âelf/dl-open.cÂÂ| 10 ++++++++--
Âelf/dl-tls.cÂÂÂ| 35 ++++++++++++++++++++++-------------
Â3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/elf/dl-close.c b/elf/dl-close.c
index 687d7de..f1d008f 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -32,6 +32,7 @@
Â#include <sysdep-cancel.h>
Â#include <tls.h>
Â#include <stap-probe.h>
+#include <atomic.h>
Â#include <dl-unmap-segments.h>
@@ -753,6 +754,7 @@ _dl_close_worker (struct link_map *map, bool force)
ÂÂÂ/* If we removed any object which uses TLS bump the generation
ÂÂÂif (any_tls)
+ÂÂÂÂÂÂatomic_write_barrier ();
ÂÂÂÂÂÂÂif (__glibc_unlikely (++GL(dl_tls_generation) == 0))
Â	_dl_fatal_printf ("TLS generation counter wrapped!ÂÂPlease
report as described in "REPORT_BUGS_TO".\n");
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 6f178b3..e696c4e 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -524,9 +524,15 @@ dl_open_worker (void *a)
ÂÂÂ/* Bump the generation number if necessary.ÂÂ*/
-ÂÂif (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0))
-ÂÂÂÂ_dl_fatal_printf (N_("\
+ÂÂif (any_tls)
+ÂÂÂÂÂÂatomic_write_barrier ();
+ÂÂÂÂÂÂif (__builtin_expect (++GL(dl_tls_generation) == 0, 0))
+ÂÂÂÂÂÂÂÂÂÂ_dl_fatal_printf (N_("\
ÂTLS generation counter wrapped!ÂÂPlease report this."));
ÂÂÂ/* We need a second pass for static tls data, because
ÂÂÂÂÂÂmust not be run while calls to _dl_add_to_slotinfo are still
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index ed13fd9..7c2c5c7 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -629,12 +629,16 @@ _dl_update_slotinfo (unsigned long int req_modid)
ÂÂÂÂÂÂpossible that other threads at the same time dynamically load
ÂÂÂÂÂÂcode and therefore add to the slotinfo list.ÂÂThis is a problem
ÂÂÂÂÂÂsince we must not pick up any information about incomplete work.
-ÂÂÂÂÂThe solution to this is to ignore all dtv slots which were
-ÂÂÂÂÂcreated after the one we are currently interested.ÂÂWe know that
-ÂÂÂÂÂdynamic loading for this module is completed and this is the last
-ÂÂÂÂÂload operation we know finished.ÂÂ*/
+ÂÂÂÂÂThe solution to this is to ensure that we capture the current
+ÂÂÂÂÂgeneration count before walking the list, and ignore any slots
+ÂÂÂÂÂwhich are marked with a higher generation count, since they may
+ÂÂÂÂÂstill be in the process of being updated.ÂÂ*/
ÂÂÂunsigned long int idx = req_modid;
ÂÂÂstruct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
+ÂÂsize_t new_gen = GL(dl_tls_generation);
+ÂÂ/* Make sure we don't read _dl_tls_generation too late. */
ÂÂÂwhile (idx >= listp->len)
@@ -642,13 +646,8 @@ _dl_update_slotinfo (unsigned long int req_modid)
ÂÂÂÂÂÂÂlistp = listp->next;
-ÂÂif (dtv[0].counter < listp->slotinfo[idx].gen)
-ÂÂÂÂÂÂ/* The generation counter for the slot is higher than what the
-	Âcurrent dtv implements.ÂÂWe have to update the whole dtv but
-	Âonly those entries with a generation counter <= the one for
-	Âthe entry we need.ÂÂ*/
-ÂÂÂÂÂÂsize_t new_gen = listp->slotinfo[idx].gen;
+ÂÂÂÂÂÂ/* Tilera: leave scope to make diffs against our baseline
easier. */
ÂÂÂÂÂÂÂsize_t total = 0;
ÂÂÂÂÂÂÂ/* We have to look through the entire dtv slotinfo list.ÂÂ*/
@@ -916,7 +915,7 @@ _dl_add_to_slotinfo (struct link_map *l)
Â	Âthe first slot.ÂÂ*/
ÂÂÂÂÂÂÂassert (idx == 0);
-ÂÂÂÂÂÂlistp = prevp->next = (struct dtv_slotinfo_list *)
+ÂÂÂÂÂÂlistp = (struct dtv_slotinfo_list *)
Â	malloc (sizeof (struct dtv_slotinfo_list)
Â		+ TLS_SLOTINFO_SURPLUS * sizeof (struct
ÂÂÂÂÂÂÂif (listp == NULL)
@@ -939,9 +938,19 @@ cannot create TLS data structures"));
ÂÂÂÂÂÂÂlistp->next = NULL;
ÂÂÂÂÂÂÂmemset (listp->slotinfo, '\0',
Â	ÂÂÂÂÂÂTLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
+ÂÂÂÂÂÂ/* Make sure the new slotinfo element is in memory before
exposing it. */
+ÂÂÂÂÂÂprevp->next = listp;
-ÂÂ/* Add the information into the slotinfo data structure.ÂÂ*/
-ÂÂlistp->slotinfo[idx].map = l;
+ÂÂ/* Add the information into the slotinfo data structure.ÂÂMake sure
+ÂÂÂÂÂto barrier as we do it, to present a consistent view to other
+ÂÂÂÂÂthreads that may be calling __tls_get_addr() as we do this.
+ÂÂÂÂÂBarrier after the writes so we know everything is ready for an
+ÂÂÂÂÂincrement of _dl_tls_generation.ÂÂ*/
ÂÂÂlistp->slotinfo[idx].gen = GL(dl_tls_generation) + 1;
+ÂÂlistp->slotinfo[idx].map = l;
__thread int dummy_tls_for_lib = 23;

void dummy_access()

default: tls_mini

CC = gcc
CFLAGS = -fpic -std=gnu11 -Wall -pedantic -Werror
LDFLAGS = -ldl

%.o: %.c
	$(CC) -c $< -o $@ $(CFLAGS) dummy.o
	$(CC) -shared dummy.o -o $(CFLAGS) tls_access.o
	$(CC) -shared tls_access.o -o $(CFLAGS)

tls_mini: tls_mini.o
	$(CC) tls_mini.o -o $@ $(CFLAGS) -L. -ltls_access $(LDFLAGS)
extern __thread volatile long tls_data;

void tls_access()
    tls_data += 2;

#include <stdio.h>
#include <dlfcn.h>
#include <stdlib.h>
#include <time.h>

__thread volatile long tls_data;

static inline double gtod()
    struct timespec tp;
    clock_gettime(CLOCK_REALTIME, &tp);
    return ((double)tp.tv_sec + tp.tv_nsec / 1.0e9);

void tls_access();

void tls_work()
    double start = gtod();
    for (long long i = 0; i < 100000000; i++) {
    double end = gtod();
    printf("time: %f\n", end-start);

int main()

    void* handle = dlopen("./", RTLD_NOW);
    if (!handle) {
        fprintf(stderr, "%s\n", dlerror());


    void (*dummy_access)(void);
    *(void**)(&dummy_access) = dlsym(handle, "dummy_access");
    if (!dummy_access) {
        fprintf(stderr, "%s\n", dlerror());

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]