This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Fixes TLS performance degradation after dlopen


Fixes a performance degradation of TLS access in shared libraries which
can be observed after another shared library that uses TLS is loaded
via dl_open. In this case __tls_get_addr takes significant more time.
Once that shared library accesses it's TLS, the performance normalizes.

Attached is a minimal example which gives the following output:

time: 0.723744
after dlopen,
time: 1.789016
after tls access in loaded lib,
time: 0.672865

We do have a use-case where this is actually really significant. I
believe this happens for instance if libstdc++ is loaded implicitly,
but TLS features are not actively used.

I strongly suspect this is the same issue as discussed in this post on
the ÂClibc mailing list:

https://lists.osuosl.org/pipermail/uclibc/2009-December/043375.html

and therefore the patch provided mainly reuses the solution they've
found for ÂClibc.


Main development platform information (tested on multiple platforms):

* latest glibc git version (pulled at Tue JunÂÂ7 15:45:13 CEST 2016)
* x86_64 (but should be independent)
*ÂLinux TP_Trommler 4.5.4-1-ARCH #1 SMP PREEMPT Wed May 11 22:21:28
CEST 2016 x86_64 GNU/Linux
*Âgcc (GCC) 6.1.1 20160501
*ÂGNU ld (GNU Binutils) 2.26.0.20160501

The patch provided touches code already altered (but not released) by
the work on bug 19329.



2016-06-07 ÂPhilipp Trommler Â<philipp.trommler@tu-dresden.de>

	[BZ #19924]
	* elf/dl-close.c: Port changes from the ÂClibc to fix
	performance degradation after dl_open
	* elf/dl-open.c: Likewise
	* elf/dl-tls.c: Likewise

---
Âelf/dl-close.c |ÂÂ2 ++
Âelf/dl-open.cÂÂ| 10 ++++++++--
Âelf/dl-tls.cÂÂÂ| 35 ++++++++++++++++++++++-------------
Â3 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/elf/dl-close.c b/elf/dl-close.c
index 687d7de..f1d008f 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -32,6 +32,7 @@
Â#include <sysdep-cancel.h>
Â#include <tls.h>
Â#include <stap-probe.h>
+#include <atomic.h>
Â
Â#include <dl-unmap-segments.h>
Â
@@ -753,6 +754,7 @@ _dl_close_worker (struct link_map *map, bool force)
ÂÂÂ/* If we removed any object which uses TLS bump the generation
counter.ÂÂ*/
ÂÂÂif (any_tls)
ÂÂÂÂÂ{
+ÂÂÂÂÂÂatomic_write_barrier ();
ÂÂÂÂÂÂÂif (__glibc_unlikely (++GL(dl_tls_generation) == 0))
Â	_dl_fatal_printf ("TLS generation counter wrapped!ÂÂPlease
report as described in "REPORT_BUGS_TO".\n");
Â
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 6f178b3..e696c4e 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -524,9 +524,15 @@ dl_open_worker (void *a)
ÂÂÂÂÂ}
Â
ÂÂÂ/* Bump the generation number if necessary.ÂÂ*/
-ÂÂif (any_tls && __builtin_expect (++GL(dl_tls_generation) == 0, 0))
-ÂÂÂÂ_dl_fatal_printf (N_("\
+ÂÂif (any_tls)
+ÂÂÂÂ{
+ÂÂÂÂÂÂatomic_write_barrier ();
+ÂÂÂÂÂÂif (__builtin_expect (++GL(dl_tls_generation) == 0, 0))
+ÂÂÂÂÂÂÂÂ{
+ÂÂÂÂÂÂÂÂÂÂ_dl_fatal_printf (N_("\
ÂTLS generation counter wrapped!ÂÂPlease report this."));
+ÂÂÂÂÂÂÂÂ}
+ÂÂÂÂ}
Â
ÂÂÂ/* We need a second pass for static tls data, because
_dl_update_slotinfo
ÂÂÂÂÂÂmust not be run while calls to _dl_add_to_slotinfo are still
pending.ÂÂ*/
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index ed13fd9..7c2c5c7 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -629,12 +629,16 @@ _dl_update_slotinfo (unsigned long int req_modid)
ÂÂÂÂÂÂpossible that other threads at the same time dynamically load
ÂÂÂÂÂÂcode and therefore add to the slotinfo list.ÂÂThis is a problem
ÂÂÂÂÂÂsince we must not pick up any information about incomplete work.
-ÂÂÂÂÂThe solution to this is to ignore all dtv slots which were
-ÂÂÂÂÂcreated after the one we are currently interested.ÂÂWe know that
-ÂÂÂÂÂdynamic loading for this module is completed and this is the last
-ÂÂÂÂÂload operation we know finished.ÂÂ*/
+ÂÂÂÂÂThe solution to this is to ensure that we capture the current
+ÂÂÂÂÂgeneration count before walking the list, and ignore any slots
+ÂÂÂÂÂwhich are marked with a higher generation count, since they may
+ÂÂÂÂÂstill be in the process of being updated.ÂÂ*/
ÂÂÂunsigned long int idx = req_modid;
ÂÂÂstruct dtv_slotinfo_list *listp = GL(dl_tls_dtv_slotinfo_list);
+ÂÂsize_t new_gen = GL(dl_tls_generation);
+
+ÂÂ/* Make sure we don't read _dl_tls_generation too late. */
+ÂÂatomic_read_barrier();
Â
ÂÂÂwhile (idx >= listp->len)
ÂÂÂÂÂ{
@@ -642,13 +646,8 @@ _dl_update_slotinfo (unsigned long int req_modid)
ÂÂÂÂÂÂÂlistp = listp->next;
ÂÂÂÂÂ}
Â
-ÂÂif (dtv[0].counter < listp->slotinfo[idx].gen)
ÂÂÂÂÂ{
-ÂÂÂÂÂÂ/* The generation counter for the slot is higher than what the
-	Âcurrent dtv implements.ÂÂWe have to update the whole dtv but
-	Âonly those entries with a generation counter <= the one for
-	Âthe entry we need.ÂÂ*/
-ÂÂÂÂÂÂsize_t new_gen = listp->slotinfo[idx].gen;
+ÂÂÂÂÂÂ/* Tilera: leave scope to make diffs against our baseline
easier. */
ÂÂÂÂÂÂÂsize_t total = 0;
Â
ÂÂÂÂÂÂÂ/* We have to look through the entire dtv slotinfo list.ÂÂ*/
@@ -916,7 +915,7 @@ _dl_add_to_slotinfo (struct link_map *l)
Â	Âthe first slot.ÂÂ*/
ÂÂÂÂÂÂÂassert (idx == 0);
Â
-ÂÂÂÂÂÂlistp = prevp->next = (struct dtv_slotinfo_list *)
+ÂÂÂÂÂÂlistp = (struct dtv_slotinfo_list *)
Â	malloc (sizeof (struct dtv_slotinfo_list)
Â		+ TLS_SLOTINFO_SURPLUS * sizeof (struct
dtv_slotinfo));
ÂÂÂÂÂÂÂif (listp == NULL)
@@ -939,9 +938,19 @@ cannot create TLS data structures"));
ÂÂÂÂÂÂÂlistp->next = NULL;
ÂÂÂÂÂÂÂmemset (listp->slotinfo, '\0',
Â	ÂÂÂÂÂÂTLS_SLOTINFO_SURPLUS * sizeof (struct dtv_slotinfo));
+
+ÂÂÂÂÂÂ/* Make sure the new slotinfo element is in memory before
exposing it. */
+ÂÂÂÂÂÂatomic_write_barrier();
+ÂÂÂÂÂÂprevp->next = listp;
ÂÂÂÂÂ}
Â
-ÂÂ/* Add the information into the slotinfo data structure.ÂÂ*/
-ÂÂlistp->slotinfo[idx].map = l;
+ÂÂ/* Add the information into the slotinfo data structure.ÂÂMake sure
+ÂÂÂÂÂto barrier as we do it, to present a consistent view to other
+ÂÂÂÂÂthreads that may be calling __tls_get_addr() as we do this.
+ÂÂÂÂÂBarrier after the writes so we know everything is ready for an
+ÂÂÂÂÂincrement of _dl_tls_generation.ÂÂ*/
ÂÂÂlistp->slotinfo[idx].gen = GL(dl_tls_generation) + 1;
+ÂÂatomic_write_barrier();
+ÂÂlistp->slotinfo[idx].map = l;
+ÂÂatomic_write_barrier();
Â}
--Â
2.5.0
__thread int dummy_tls_for_lib = 23;

void dummy_access()
{
    dummy_tls_for_lib++;
}

default: tls_mini libdummy.so

CC = gcc
CFLAGS = -fpic -std=gnu11 -Wall -pedantic -Werror
LDFLAGS = -ldl

%.o: %.c
	$(CC) -c $< -o $@ $(CFLAGS)

libdummy.so: dummy.o
	$(CC) -shared dummy.o -o libdummy.so $(CFLAGS)

libtls_access.so: tls_access.o
	$(CC) -shared tls_access.o -o libtls_access.so $(CFLAGS)

tls_mini: tls_mini.o libtls_access.so
	$(CC) tls_mini.o -o $@ $(CFLAGS) -L. -ltls_access $(LDFLAGS)
extern __thread volatile long tls_data;

void tls_access()
{   
    tls_data += 2;
}

#include <stdio.h>
#include <dlfcn.h>
#include <stdlib.h>
#include <time.h>

__thread volatile long tls_data;

static inline double gtod()
{
    struct timespec tp;
    clock_gettime(CLOCK_REALTIME, &tp);
    return ((double)tp.tv_sec + tp.tv_nsec / 1.0e9);
}

void tls_access();

void tls_work()
{
    double start = gtod();
    for (long long i = 0; i < 100000000; i++) {
        tls_access();
    }
    double end = gtod();
    printf("time: %f\n", end-start);
}

int main()
{
    tls_work();

    void* handle = dlopen("./libdummy.so", RTLD_NOW);
    if (!handle) {
        fprintf(stderr, "%s\n", dlerror());
        exit(EXIT_FAILURE);
    }

    tls_work();

    void (*dummy_access)(void);
    *(void**)(&dummy_access) = dlsym(handle, "dummy_access");
    if (!dummy_access) {
        fprintf(stderr, "%s\n", dlerror());
        exit(EXIT_FAILURE);
    }
    
    dummy_access();
    
    tls_work();
    
    dlclose(handle);
}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]