It looks like GL(dl_ns)[cnt]._ns_nloaded for cnt > 0 is accounted N times instead of once. Fix: diff --git a/elf/dl-iteratephdr.c b/elf/dl-iteratephdr.c index 88473e790b..bbbd8c1e8f 100644 --- a/elf/dl-iteratephdr.c +++ b/elf/dl-iteratephdr.c @@ -47,17 +47,16 @@ __dl_iterate_phdr (int (*callback) (struct dl_phdr_info *info, #ifdef SHARED const void *caller = RETURN_ADDRESS (0); for (Lmid_t cnt = GL(dl_nns) - 1; cnt > 0; --cnt) - for (struct link_map *l = GL(dl_ns)[cnt]._ns_loaded; l; l = l->l_next) - { - /* We have to count the total number of loaded objects. */ - nloaded += GL(dl_ns)[cnt]._ns_nloaded; - + { + /* We have to count the total number of loaded objects. */ + nloaded += GL(dl_ns)[cnt]._ns_nloaded; + for (struct link_map *l = GL(dl_ns)[cnt]._ns_loaded; l; l = l->l_next) if (caller >= (const void *) l->l_map_start && caller < (const void *) l->l_map_end && (l->l_contiguous || _dl_addr_inside_object (l, (ElfW(Addr)) caller))) ns = cnt; - } + } #endif for (l = GL(dl_ns)[ns]._ns_loaded; l != NULL; l = l->l_next)
Note that + /* There must be exactly one DSO for the range of the virtual + memory. Otherwise something is really broken. */ + ns = cnt; misses the obvious optimization of not needing to continue iteration, like the following. Though the comment was removed at some point. diff --git a/elf/dl-iteratephdr.c b/elf/dl-iteratephdr.c index 88473e790b..c33f6e14f2 100644 --- a/elf/dl-iteratephdr.c +++ b/elf/dl-iteratephdr.c @@ -47,17 +47,20 @@ __dl_iterate_phdr (int (*callback) (struct dl_phdr_info *info, #ifdef SHARED const void *caller = RETURN_ADDRESS (0); for (Lmid_t cnt = GL(dl_nns) - 1; cnt > 0; --cnt) - for (struct link_map *l = GL(dl_ns)[cnt]._ns_loaded; l; l = l->l_next) - { - /* We have to count the total number of loaded objects. */ - nloaded += GL(dl_ns)[cnt]._ns_nloaded; - - if (caller >= (const void *) l->l_map_start - && caller < (const void *) l->l_map_end - && (l->l_contiguous - || _dl_addr_inside_object (l, (ElfW(Addr)) caller))) - ns = cnt; - } + { + /* We have to count the total number of loaded objects. */ + nloaded += GL(dl_ns)[cnt]._ns_nloaded; + if (ns == 0) + for (struct link_map *l = GL(dl_ns)[cnt]._ns_loaded; l; l = l->l_next) + if (caller >= (const void *) l->l_map_start + && caller < (const void *) l->l_map_end + && (l->l_contiguous + || _dl_addr_inside_object (l, (ElfW(Addr)) caller))) + { + ns = cnt; + break; + } + } #endif for (l = GL(dl_ns)[ns]._ns_loaded; l != NULL; l = l->l_next)
Not iterating over ns == 0 will of course fail to terminate the iteration early. Computing nloaded every time doesn't seem to have much benefit over the previously existing global for it, either. So just having globals for dlpi_adds and dlpi_subs, rather than being too clever here, would be an improvement IMHO (and a TLS single-entry cache for the last used namespace would be possible).
Previously posted patch: https://sourceware.org/ml/libc-alpha/2016-07/msg00609.html https://sourceware.org/ml/libc-alpha/2016-08/msg00005.html