This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH RFC] introduce dl_iterate_phdr_parallel


Problem: exception handling is not scalable. Attached program mt.cc
demonstrates this easily. It runs around 1 seconds with 1 thread, but
with only 4 threads it takes 4.5 seconds to complete. The reason is
locks that are taken on unwind path.

There are two locks right now.

Fist is in libgcc's _Unwind_Find_registered_FDE and it is eliminated by
https://gcc.gnu.org/ml/gcc-patches/2016-07/msg01629.html (for dynamic
executables only, but this is common case).

Second one is dl_load_write_lock in __dl_iterate_phdr. It serves dual
purpose: it stops the list of loaded objects from been modified while
iterating over it and it makes sure that more than one callback will
not run in parallel. This means that even if we will find a cleaver way
to make __dl_iterate_phdr iterate over the objects list locklessly the
lock will still have to be taken around the call to callback to preserve
second guaranty.

This patch here propose to introduce another API: dl_iterate_phdr_parallel
which does exactly same thing as dl_iterate_phdr but may run more than
one provided callback in parallel. And to make it more scalable it
breaks single dl_load_write_lock into arrays of locks. Reader takes only
one lock, but writer has to lock all of them to proceed with modifying
the list.

Attached gcc.patch makes gcc use proposed dl_iterate_phdr_parallel
interface and with this modification (and the patch linked above) same
mt.cc with 4 threads runs around 1 second.

diff --git a/elf/Versions b/elf/Versions
index 23deda9..56b7510 100644
--- a/elf/Versions
+++ b/elf/Versions
@@ -11,6 +11,9 @@ libc {
   GLIBC_2.2.4 {
     dl_iterate_phdr;
   }
+  GLIBC_2.23 {
+    dl_iterate_phdr_parallel;
+  }
 %ifdef EXPORT_UNWIND_FIND_FDE
   # Needed for SHLIB_COMPAT calls using this version.
   GLIBC_2.2.5 {
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 687d7de..9b8e9ac 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -535,7 +535,8 @@ _dl_close_worker (struct link_map *map, bool force)
   tls_free_start = tls_free_end = NO_TLS_OFFSET;
 
   /* We modify the list of loaded objects.  */
-  __rtld_lock_lock_recursive (GL(dl_load_write_lock));
+  for (int i = 0; i < _DL_LOCKS; i++)
+    __rtld_lock_lock_recursive (GL(dl_load_write_lock)[i]);
 
   /* Check each element of the search list to see if all references to
      it are gone.  */
@@ -748,7 +749,8 @@ _dl_close_worker (struct link_map *map, bool force)
 	}
     }
 
-  __rtld_lock_unlock_recursive (GL(dl_load_write_lock));
+  for (int i = 0; i < _DL_LOCKS; i++)
+    __rtld_lock_unlock_recursive (GL(dl_load_write_lock)[i]);
 
   /* If we removed any object which uses TLS bump the generation counter.  */
   if (any_tls)
diff --git a/elf/dl-iteratephdr.c b/elf/dl-iteratephdr.c
index 4db6938..69e4a79 100644
--- a/elf/dl-iteratephdr.c
+++ b/elf/dl-iteratephdr.c
@@ -22,24 +22,47 @@
 #include <stddef.h>
 #include <libc-lock.h>
 
+static
+uintptr_t crank(uintptr_t word)
+{ 
+  word ^= (word << 13); word ^= (word >> 17); word ^= (word << 5); 
+  word = 69069*word + 12345;
+  return word;
+}
+
+static
+uintptr_t hash(uintptr_t val)
+{
+  char* x = (char*)&val;
+  uintptr_t r = 0;
+  r = x[0];
+  for (unsigned i = 1; i < sizeof(val); ++i)
+    {
+      r = crank(r) + x[i];
+      r = crank(r);
+    }
+  return r & (_DL_LOCKS - 1);
+}
+
+
 static void
-cancel_handler (void *arg __attribute__((unused)))
+cancel_handler (void *arg)
 {
-  __rtld_lock_unlock_recursive (GL(dl_load_write_lock));
+  __rtld_lock_unlock_recursive (GL(dl_load_write_lock)[(uintptr_t)arg]);
 }
 
-hidden_proto (__dl_iterate_phdr)
+static
 int
-__dl_iterate_phdr (int (*callback) (struct dl_phdr_info *info,
-				    size_t size, void *data), void *data)
+__dl_iterate_phdr_internal (int (*callback) (struct dl_phdr_info *info,
+				    size_t size, void *data), void *data, unsigned idx)
 {
   struct link_map *l;
   struct dl_phdr_info info;
   int ret = 0;
 
   /* Make sure nobody modifies the list of loaded objects.  */
-  __rtld_lock_lock_recursive (GL(dl_load_write_lock));
-  __libc_cleanup_push (cancel_handler, NULL);
+  __rtld_lock_lock_recursive (GL(dl_load_write_lock)[idx]);
+  __libc_cleanup_push (cancel_handler, (void*)(uintptr_t)idx);
 
   /* We have to determine the namespace of the caller since this determines
      which namespace is reported.  */
@@ -80,10 +103,26 @@ __dl_iterate_phdr (int (*callback) (struct dl_phdr_info *info,
 
   /* Release the lock.  */
   __libc_cleanup_pop (0);
-  __rtld_lock_unlock_recursive (GL(dl_load_write_lock));
+  __rtld_lock_unlock_recursive (GL(dl_load_write_lock)[idx]);
 
   return ret;
 }
+
+hidden_proto (__dl_iterate_phdr)
+int
+__dl_iterate_phdr (int (*callback) (struct dl_phdr_info *info,
+				    size_t size, void *data), void *data) {
+  return __dl_iterate_phdr_internal(callback, data, 0);
+}
 hidden_def (__dl_iterate_phdr)
 
+hidden_proto (__dl_iterate_phdr_parallel)
+int
+__dl_iterate_phdr_parallel (int (*callback) (struct dl_phdr_info *info,
+				    size_t size, void *data), void *data) {
+  return __dl_iterate_phdr_internal(callback, data, hash((unsigned long)THREAD_SELF));
+}
+hidden_def (__dl_iterate_phdr_parallel)
+
 weak_alias (__dl_iterate_phdr, dl_iterate_phdr);
+weak_alias (__dl_iterate_phdr_parallel, dl_iterate_phdr_parallel);
diff --git a/elf/dl-object.c b/elf/dl-object.c
index 362992b..a8df503 100644
--- a/elf/dl-object.c
+++ b/elf/dl-object.c
@@ -31,7 +31,8 @@ internal_function
 _dl_add_to_namespace_list (struct link_map *new, Lmid_t nsid)
 {
   /* We modify the list of loaded objects.  */
-  __rtld_lock_lock_recursive (GL(dl_load_write_lock));
+  for (int i = 0; i < _DL_LOCKS; i++)
+    __rtld_lock_lock_recursive (GL(dl_load_write_lock)[i]);
 
   if (GL(dl_ns)[nsid]._ns_loaded != NULL)
     {
@@ -48,7 +49,8 @@ _dl_add_to_namespace_list (struct link_map *new, Lmid_t nsid)
   new->l_serial = GL(dl_load_adds);
   ++GL(dl_load_adds);
 
-  __rtld_lock_unlock_recursive (GL(dl_load_write_lock));
+  for (int i = 0; i < _DL_LOCKS; i++)
+    __rtld_lock_unlock_recursive (GL(dl_load_write_lock)[i]);
 }
 
 
diff --git a/elf/dl-support.c b/elf/dl-support.c
index c30194c..0fc12e4 100644
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -213,7 +213,7 @@ __rtld_lock_define_initialized_recursive (, _dl_load_lock)
 /* This lock is used to keep __dl_iterate_phdr from inspecting the
    list of loaded objects while an object is added to or removed from
    that list.  */
-__rtld_lock_define_initialized_recursive (, _dl_load_write_lock)
+__rtld_lock_recursive_t _dl_load_write_lock[_DL_LOCKS] = { _RTLD_LOCK_RECURSIVE_INITIALIZER, };
 
 
 #ifdef HAVE_AUX_VECTOR
diff --git a/elf/link.h b/elf/link.h
index f448141..56fd4b5 100644
--- a/elf/link.h
+++ b/elf/link.h
@@ -168,6 +168,9 @@ extern int dl_iterate_phdr (int (*__callback) (struct dl_phdr_info *,
 					       size_t, void *),
 			    void *__data);
 
+extern int dl_iterate_phdr_parallel (int (*__callback) (struct dl_phdr_info *,
+					       size_t, void *),
+			    void *__data);
 
 /* Prototypes for the ld.so auditing interfaces.  These are not
    defined anywhere in ld.so but instead have to be provided by the
diff --git a/elf/rtld.c b/elf/rtld.c
index 647661c..9c65b03 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -128,7 +128,7 @@ struct rtld_global _rtld_global =
     ._dl_stack_flags = DEFAULT_STACK_PERMS,
 #ifdef _LIBC_REENTRANT
     ._dl_load_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER,
-    ._dl_load_write_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER,
+    ._dl_load_write_lock = { _RTLD_LOCK_RECURSIVE_INITIALIZER, },
 #endif
     ._dl_nns = 1,
     ._dl_ns =
diff --git a/include/link.h b/include/link.h
index 32a7392..9582ed9 100644
--- a/include/link.h
+++ b/include/link.h
@@ -336,6 +336,10 @@ extern int __dl_iterate_phdr (int (*callback) (struct dl_phdr_info *info,
 					       size_t size, void *data),
 			      void *data);
 
+extern int __dl_iterate_phdr_parallel (int (*callback) (struct dl_phdr_info *info,
+					       size_t size, void *data),
+			      void *data);
+
 /* We use this macro to refer to ELF macros independent of the native
    wordsize.  `ELFW(R_TYPE)' is used in place of `ELF32_R_TYPE' or
    `ELF64_R_TYPE'.  */
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index f68fdf4..0d2b2a1 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -268,6 +268,8 @@ typedef void (*receiver_fct) (int, const char *, const char *);
    user interface to run-time dynamic linking.  */
 
 
+#define _DL_LOCKS 64 
+
 #ifndef SHARED
 # define EXTERN extern
 # define GL(name) _##name
@@ -334,7 +336,7 @@ struct rtld_global
   /* This lock is used to keep __dl_iterate_phdr from inspecting the
      list of loaded objects while an object is added to or removed
      from that list.  */
-  __rtld_lock_define_recursive (EXTERN, _dl_load_write_lock)
+  __rtld_lock_define_recursive (EXTERN, _dl_load_write_lock[_DL_LOCKS])
 
   /* Incremented whenever something may have been added to dl_loaded.  */
   EXTERN unsigned long long _dl_load_adds;
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
index c1054ce..ea9d2e9 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
@@ -82,6 +82,7 @@ GLIBC_2.17 clock_settime F
 GLIBC_2.17 secure_getenv F
 GLIBC_2.18 GLIBC_2.18 A
 GLIBC_2.18 __cxa_thread_atexit_impl F
+GLIBC_2.23 dl_iterate_phdr_parallel F
 GLIBC_2.2.5 GLIBC_2.2.5 A
 GLIBC_2.2.5 _Exit F
 GLIBC_2.2.5 _IO_2_1_stderr_ D 0xe0

--
			Gleb.

Attachment: mt.cc
Description: Text document

Attachment: gcc.patch
Description: Text document


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]