Bug 26104 - New forked process __reclaim_stacks endless loop
Summary: New forked process __reclaim_stacks endless loop
Status: NEW
Alias: None
Product: glibc
Classification: Unclassified
Component: nptl (show other bugs)
Version: unspecified
: P2 normal
Target Milestone: ---
Assignee: Not yet assigned to anyone
URL:
Keywords:
: 17326 (view as bug list)
Depends on:
Blocks:
 
Reported: 2020-06-10 11:20 UTC by buque
Modified: 2020-06-12 02:39 UTC (History)
3 users (show)

See Also:
Host:
Target:
Build:
Last reconfirmed: 2020-06-11 00:00:00


Attachments

Note You need to log in before you can comment on or make changes to this bug.
Description buque 2020-06-10 11:20:15 UTC
Hello, there is an endless loop in __reclaim_stacks (while traversing the stack_cache doubly-linked list):

code:
  /* Reset the PIDs in any cached stacks.  */
  list_for_each (runp, &stack_cache)
    {
      struct pthread *curp = list_entry (runp, struct pthread, list);
      curp->pid = self->pid;
    }

call stack:
	root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]# gstack 72000
	#0  0x00007fd93f0ff3ad in __reclaim_stacks () from /lib64/libpthread.so.0
	#1  0x00007fd93ebecebe in fork () from /lib64/libc.so.6
	#2  0x000056016a453d1e in monitor_popen ()
	#3  0x000056016a4590fd in ?? ()
	#4  0x000056016a459285 in ?? ()
	#5  0x00007fd93f0ffdf5 in start_thread () from /lib64/libpthread.so.0
	#6  0x00007fd93ec2648d in clone () from /lib64/libc.so.6


The stack_cache list has only one node, but the stack_cache->next pointer is wrong. I think it is an address that was freed by the parent process, because it looks like a normal address that was corrupted with 0x11940 (72000, the TID of the child process), and the pointer is not present in either process's maps.

(gdb) p stack_cache
$1 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) p stack_cache.next
$2 = (struct list_head *) 0x11940177fe9c0
(gdb) p stack_cache.next->next
Cannot access memory at address 0x11940177fe9c0   //0x11940177fe9c0(0x7fd9177fe9c0)


[root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]# ps -efl | grep sysmo
1 S root       5530      1  0  80   0 - 286005 hrtime Apr11 ?       00:35:27 /usr/bin/sysmonitor --daemon
0 S root       5834   5530  0  80   0 - 32480 poll_s Apr11 ?        00:00:00 python /usr/libexec/sysmonitor/clocktransition.py
1 R root      31897   5530 99  80   0 - 286005 -     May07 ?        24-23:45:49 /usr/bin/sysmonitor --daemon
1 R root      72000   5530 99  80   0 - 286005 -     May04 ?        27-15:09:41 /usr/bin/sysmonitor --daemon
0 S root     121908 121213  0  80   0 - 28182 pipe_w 13:16 pts/0    00:00:00 grep --color=auto sysmo
[root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]# cat /proc/5530/maps
56016a44f000-56016a46c000 r-xp 00000000 fd:00 277411                     /usr/bin/sysmonitor
56016a66b000-56016a66c000 r--p 0001c000 fd:00 277411                     /usr/bin/sysmonitor
56016a66c000-56016a66e000 rw-p 0001d000 fd:00 277411                     /usr/bin/sysmonitor
56016a66e000-56016a6ed000 rw-p 00000000 00:00 0
56016b0ff000-56016b120000 rw-p 00000000 00:00 0                          [heap]
7fd8f0000000-7fd8f0021000 rw-p 00000000 00:00 0
7fd8f0021000-7fd8f4000000 ---p 00000000 00:00 0
7fd8f8000000-7fd8f8021000 rw-p 00000000 00:00 0
7fd8f8021000-7fd8fc000000 ---p 00000000 00:00 0
7fd8fc000000-7fd8fc021000 rw-p 00000000 00:00 0
7fd8fc021000-7fd900000000 ---p 00000000 00:00 0
7fd900000000-7fd900021000 rw-p 00000000 00:00 0
7fd900021000-7fd904000000 ---p 00000000 00:00 0
7fd904000000-7fd904021000 rw-p 00000000 00:00 0
7fd904021000-7fd908000000 ---p 00000000 00:00 0
7fd908000000-7fd908021000 rw-p 00000000 00:00 0
7fd908021000-7fd90c000000 ---p 00000000 00:00 0
7fd90effe000-7fd90efff000 ---p 00000000 00:00 0
7fd90efff000-7fd90f7ff000 rw-p 00000000 00:00 0
7fd910000000-7fd910021000 rw-p 00000000 00:00 0
7fd910021000-7fd914000000 ---p 00000000 00:00 0
7fd9157fb000-7fd9157fc000 ---p 00000000 00:00 0
7fd9157fc000-7fd915ffc000 rw-p 00000000 00:00 0
7fd915ffc000-7fd915ffd000 ---p 00000000 00:00 0
7fd915ffd000-7fd9167fd000 rw-p 00000000 00:00 0
7fd9167fd000-7fd9167fe000 ---p 00000000 00:00 0
7fd9167fe000-7fd916ffe000 rw-p 00000000 00:00 0
7fd916ffe000-7fd916fff000 ---p 00000000 00:00 0
7fd916fff000-7fd9177ff000 rw-p 00000000 00:00 0
7fd9177ff000-7fd917800000 ---p 00000000 00:00 0
7fd917800000-7fd918000000 rw-p 00000000 00:00 0
7fd918000000-7fd918021000 rw-p 00000000 00:00 0
7fd918021000-7fd91c000000 ---p 00000000 00:00 0
7fd91c000000-7fd91c021000 rw-p 00000000 00:00 0
7fd91c021000-7fd920000000 ---p 00000000 00:00 0
7fd920000000-7fd920021000 rw-p 00000000 00:00 0
7fd920021000-7fd924000000 ---p 00000000 00:00 0
7fd924000000-7fd924021000 rw-p 00000000 00:00 0
7fd924021000-7fd928000000 ---p 00000000 00:00 0
7fd928000000-7fd928021000 rw-p 00000000 00:00 0
7fd928021000-7fd92c000000 ---p 00000000 00:00 0
7fd92c000000-7fd92c021000 rw-p 00000000 00:00 0
7fd92c021000-7fd930000000 ---p 00000000 00:00 0
7fd930000000-7fd930021000 rw-p 00000000 00:00 0
7fd930021000-7fd934000000 ---p 00000000 00:00 0
7fd9347f9000-7fd9347fa000 ---p 00000000 00:00 0
7fd9347fa000-7fd934ffa000 rw-p 00000000 00:00 0
7fd934ffa000-7fd934ffb000 ---p 00000000 00:00 0
7fd934ffb000-7fd9357fb000 rw-p 00000000 00:00 0
7fd9357fb000-7fd9357fc000 ---p 00000000 00:00 0
7fd9357fc000-7fd935ffc000 rw-p 00000000 00:00 0
7fd935ffc000-7fd935ffd000 ---p 00000000 00:00 0
7fd935ffd000-7fd9367fd000 rw-p 00000000 00:00 0
7fd9367fd000-7fd9367fe000 ---p 00000000 00:00 0
7fd9367fe000-7fd936ffe000 rw-p 00000000 00:00 0
7fd936ffe000-7fd936fff000 ---p 00000000 00:00 0
7fd936fff000-7fd9377ff000 rw-p 00000000 00:00 0
7fd9377ff000-7fd937800000 ---p 00000000 00:00 0
7fd937800000-7fd938000000 rw-p 00000000 00:00 0
7fd938000000-7fd938021000 rw-p 00000000 00:00 0
7fd938021000-7fd93c000000 ---p 00000000 00:00 0
7fd93c10c000-7fd93c118000 r-xp 00000000 fd:00 266862                     /usr/lib64/libnss_files-2.17.so
7fd93c118000-7fd93c317000 ---p 0000c000 fd:00 266862                     /usr/lib64/libnss_files-2.17.so
7fd93c317000-7fd93c318000 r--p 0000b000 fd:00 266862                     /usr/lib64/libnss_files-2.17.so
7fd93c318000-7fd93c31f000 rw-p 00000000 00:00 0
7fd93c31f000-7fd93c320000 ---p 00000000 00:00 0
7fd93c320000-7fd93cb20000 rw-p 00000000 00:00 0
7fd93cb20000-7fd93cb21000 ---p 00000000 00:00 0
7fd93cb21000-7fd93d321000 rw-p 00000000 00:00 0
7fd93d321000-7fd93d322000 ---p 00000000 00:00 0
7fd93d322000-7fd93db22000 rw-p 00000000 00:00 0
7fd93db22000-7fd93db23000 ---p 00000000 00:00 0
7fd93db23000-7fd93e323000 rw-p 00000000 00:00 0
7fd93e323000-7fd93e324000 ---p 00000000 00:00 0
7fd93e324000-7fd93eb24000 rw-p 00000000 00:00 0
7fd93eb24000-7fd93eceb000 r-xp 00000000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eceb000-7fd93eeea000 ---p 001c7000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eeea000-7fd93eeee000 r--p 001c6000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eeee000-7fd93eef0000 rw-p 001ca000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eef0000-7fd93eef5000 rw-p 00000000 00:00 0
7fd93eef5000-7fd93eef7000 r-xp 00000000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93eef7000-7fd93f0f6000 ---p 00002000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93f0f6000-7fd93f0f7000 r--p 00001000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93f0f7000-7fd93f0f8000 rw-p 00002000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93f0f8000-7fd93f10f000 r-xp 00000000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f10f000-7fd93f30e000 ---p 00017000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f30e000-7fd93f30f000 r--p 00016000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f30f000-7fd93f310000 rw-p 00017000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f310000-7fd93f314000 rw-p 00000000 00:00 0
7fd93f314000-7fd93f325000 r-xp 00000000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f325000-7fd93f525000 ---p 00011000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f525000-7fd93f526000 r--p 00011000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f526000-7fd93f527000 rw-p 00012000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f527000-7fd93f549000 r-xp 00000000 fd:00 266837                     /usr/lib64/ld-2.17.so
7fd93f739000-7fd93f73d000 rw-p 00000000 00:00 0
7fd93f747000-7fd93f748000 rw-p 00000000 00:00 0
7fd93f748000-7fd93f749000 r--p 00021000 fd:00 266837                     /usr/lib64/ld-2.17.so
7fd93f749000-7fd93f74a000 rw-p 00022000 fd:00 266837                     /usr/lib64/ld-2.17.so
7fd93f74a000-7fd93f74b000 rw-p 00000000 00:00 0
7ffd02751000-7ffd02772000 rw-p 00000000 00:00 0                          [stack]
7ffd02780000-7ffd02782000 r-xp 00000000 00:00 0                          [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
[root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]# cat /proc/72000/maps
56016a44f000-56016a46c000 r-xp 00000000 fd:00 277411                     /usr/bin/sysmonitor
56016a66b000-56016a66c000 r--p 0001c000 fd:00 277411                     /usr/bin/sysmonitor
56016a66c000-56016a66e000 rw-p 0001d000 fd:00 277411                     /usr/bin/sysmonitor
56016a66e000-56016a6ed000 rw-p 00000000 00:00 0
56016b0ff000-56016b120000 rw-p 00000000 00:00 0                          [heap]
7fd8f0000000-7fd8f0021000 rw-p 00000000 00:00 0
7fd8f0021000-7fd8f4000000 ---p 00000000 00:00 0
7fd8f8000000-7fd8f8021000 rw-p 00000000 00:00 0
7fd8f8021000-7fd8fc000000 ---p 00000000 00:00 0
7fd8fc000000-7fd8fc021000 rw-p 00000000 00:00 0
7fd8fc021000-7fd900000000 ---p 00000000 00:00 0
7fd900000000-7fd900021000 rw-p 00000000 00:00 0
7fd900021000-7fd904000000 ---p 00000000 00:00 0
7fd904000000-7fd904021000 rw-p 00000000 00:00 0
7fd904021000-7fd908000000 ---p 00000000 00:00 0
7fd908000000-7fd908021000 rw-p 00000000 00:00 0
7fd908021000-7fd90c000000 ---p 00000000 00:00 0
7fd90effe000-7fd90efff000 ---p 00000000 00:00 0
7fd90efff000-7fd90f7ff000 rw-p 00000000 00:00 0
7fd910000000-7fd910021000 rw-p 00000000 00:00 0
7fd910021000-7fd914000000 ---p 00000000 00:00 0
7fd9157fb000-7fd9157fc000 ---p 00000000 00:00 0
7fd9157fc000-7fd915ffc000 rw-p 00000000 00:00 0
7fd915ffc000-7fd915ffd000 ---p 00000000 00:00 0
7fd915ffd000-7fd9167fd000 rw-p 00000000 00:00 0
7fd9167fd000-7fd9167fe000 ---p 00000000 00:00 0
7fd9167fe000-7fd916ffe000 rw-p 00000000 00:00 0
7fd916ffe000-7fd916fff000 ---p 00000000 00:00 0
7fd916fff000-7fd9177ff000 rw-p 00000000 00:00 0
7fd9177ff000-7fd917800000 ---p 00000000 00:00 0
7fd917800000-7fd918000000 rw-p 00000000 00:00 0
7fd918000000-7fd918021000 rw-p 00000000 00:00 0
7fd918021000-7fd91c000000 ---p 00000000 00:00 0
7fd91c000000-7fd91c021000 rw-p 00000000 00:00 0
7fd91c021000-7fd920000000 ---p 00000000 00:00 0
7fd920000000-7fd920021000 rw-p 00000000 00:00 0
7fd920021000-7fd924000000 ---p 00000000 00:00 0
7fd924000000-7fd924021000 rw-p 00000000 00:00 0
7fd924021000-7fd928000000 ---p 00000000 00:00 0
7fd928000000-7fd928021000 rw-p 00000000 00:00 0
7fd928021000-7fd92c000000 ---p 00000000 00:00 0
7fd92c000000-7fd92c021000 rw-p 00000000 00:00 0
7fd92c021000-7fd930000000 ---p 00000000 00:00 0
7fd930000000-7fd930021000 rw-p 00000000 00:00 0
7fd930021000-7fd934000000 ---p 00000000 00:00 0
7fd9347f9000-7fd9347fa000 ---p 00000000 00:00 0
7fd9347fa000-7fd934ffa000 rw-p 00000000 00:00 0
7fd934ffa000-7fd934ffb000 ---p 00000000 00:00 0
7fd934ffb000-7fd9357fb000 rw-p 00000000 00:00 0
7fd9357fb000-7fd9357fc000 ---p 00000000 00:00 0
7fd9357fc000-7fd935ffc000 rw-p 00000000 00:00 0
7fd935ffc000-7fd935ffd000 ---p 00000000 00:00 0
7fd935ffd000-7fd9367fd000 rw-p 00000000 00:00 0
7fd9367fd000-7fd9367fe000 ---p 00000000 00:00 0
7fd9367fe000-7fd936ffe000 rw-p 00000000 00:00 0
7fd936ffe000-7fd936fff000 ---p 00000000 00:00 0
7fd936fff000-7fd9377ff000 rw-p 00000000 00:00 0
7fd9377ff000-7fd937800000 ---p 00000000 00:00 0
7fd937800000-7fd938000000 rw-p 00000000 00:00 0
7fd938000000-7fd938021000 rw-p 00000000 00:00 0
7fd938021000-7fd93c000000 ---p 00000000 00:00 0
7fd93c10c000-7fd93c118000 r-xp 00000000 fd:00 266862                     /usr/lib64/libnss_files-2.17.so
7fd93c118000-7fd93c317000 ---p 0000c000 fd:00 266862                     /usr/lib64/libnss_files-2.17.so
7fd93c317000-7fd93c318000 r--p 0000b000 fd:00 266862                     /usr/lib64/libnss_files-2.17.so
7fd93c318000-7fd93c31f000 rw-p 00000000 00:00 0
7fd93c31f000-7fd93c320000 ---p 00000000 00:00 0
7fd93c320000-7fd93cb20000 rw-p 00000000 00:00 0
7fd93cb20000-7fd93cb21000 ---p 00000000 00:00 0
7fd93cb21000-7fd93d321000 rw-p 00000000 00:00 0
7fd93d321000-7fd93d322000 ---p 00000000 00:00 0
7fd93d322000-7fd93db22000 rw-p 00000000 00:00 0
7fd93db22000-7fd93db23000 ---p 00000000 00:00 0
7fd93db23000-7fd93e323000 rw-p 00000000 00:00 0
7fd93e323000-7fd93e324000 ---p 00000000 00:00 0
7fd93e324000-7fd93eb24000 rw-p 00000000 00:00 0
7fd93eb24000-7fd93eceb000 r-xp 00000000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eceb000-7fd93eeea000 ---p 001c7000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eeea000-7fd93eeee000 r--p 001c6000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eeee000-7fd93eef0000 rw-p 001ca000 fd:00 266844                     /usr/lib64/libc-2.17.so
7fd93eef0000-7fd93eef5000 rw-p 00000000 00:00 0
7fd93eef5000-7fd93eef7000 r-xp 00000000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93eef7000-7fd93f0f6000 ---p 00002000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93f0f6000-7fd93f0f7000 r--p 00001000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93f0f7000-7fd93f0f8000 rw-p 00002000 fd:00 272436                     /usr/lib64/libalarm.so
7fd93f0f8000-7fd93f10f000 r-xp 00000000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f10f000-7fd93f30e000 ---p 00017000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f30e000-7fd93f30f000 r--p 00016000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f30f000-7fd93f310000 rw-p 00017000 fd:00 266870                     /usr/lib64/libpthread-2.17.so
7fd93f310000-7fd93f314000 rw-p 00000000 00:00 0
7fd93f314000-7fd93f325000 r-xp 00000000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f325000-7fd93f525000 ---p 00011000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f525000-7fd93f526000 r--p 00011000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f526000-7fd93f527000 rw-p 00012000 fd:00 262290                     /usr/lib64/libsecurec.so
7fd93f527000-7fd93f549000 r-xp 00000000 fd:00 266837                     /usr/lib64/ld-2.17.so
7fd93f739000-7fd93f73d000 rw-p 00000000 00:00 0
7fd93f747000-7fd93f748000 rw-p 00000000 00:00 0
7fd93f748000-7fd93f749000 r--p 00021000 fd:00 266837                     /usr/lib64/ld-2.17.so
7fd93f749000-7fd93f74a000 rw-p 00022000 fd:00 266837                     /usr/lib64/ld-2.17.so
7fd93f74a000-7fd93f74b000 rw-p 00000000 00:00 0
7ffd02751000-7ffd02772000 rw-p 00000000 00:00 0                          [stack]
7ffd02780000-7ffd02782000 r-xp 00000000 00:00 0                          [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0                  [vsyscall]
[root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]#

and I found the same issue below:
https://sourceware.org/bugzilla/show_bug.cgi?id=17326

While process A is forking B, thread a of A is being destroyed. stack_cache and in_flight_stack are in different VMAs and physical pages. Copy-on-write first locks stack_cache's page table, then locks in_flight_stack's page table.

I think putting in_flight_stack and stack_cache in the same page, or locking stack_cache before fork, could solve this issue.

(gdb) p &in_flight_stack
$2 = (uintptr_t *) 0x7fd93f3130b0 <in_flight_stack>
(gdb) p &stack_cache
$3 = (list_t *) 0x7fd93f30f020 <stack_cache>
(gdb) p 0x7fd93f3130b0-0x7fd93f30f020
$4 = 16528
Comment 1 Carlos O'Donell 2020-06-11 17:57:46 UTC
I believe this is a duplicate of bug 17326.

What version of glibc are you using?

It doesn't seem like you've shown proof of an endless loop. Your example seems to show that the list simply terminates in unmapped memory, which would cause a crash. The next address, e.g. 0x11940177fe9c0, looks corrupted. Do you see an endless loop or a crash?

In order to end up in an endless loop you have to have a non-head entry in the cache that points to reused memory that happens to contain values that create an endless loop in __reclaim_stacks().

In order to accomplish this the forked process must observe:

* An incomplete transition of the list entry, with the next pointer pointing to a mapping that is being used for other purposes and contains data that causes the circular list.

* A thread in the parent must have unlinked the entry via free_stacks, must have unmapped the memory, must have remapped something else in that VMA which contains data that causes the circular list.

I'm not sure how that could happen. If you can write out your analysis that would help.

Process A
- Thread B
  - calls pthread_join
- Thread A exits and is joined
  - __free_tcb()
    - __deallocate_stack()
      - queue_stack()
        - stack cache is full
        - free_stacks (walk list backwards via ->prev)
          - stack_list_del
          - in_flight_stack = elem
          - atomic write barrier
          - list_del(elem)

 52 /* Remove element from list.  */
 53 static inline void
 54 list_del (list_t *elem)
 55 {
 56   elem->next->prev = elem->prev;
 57   elem->prev->next = elem->next;
 58 }

Process B
- Forked.
- COW copies the page containing the stack_cache head, observing none of the updates from lines 56 and 57 because they have not been flushed yet.
- COW copies page that contains the reused memory that used to be a struct pthread at the head's next element.
- __reclaim_stacks()
  - Loops endlessly clearing the same memory.

This looks like a real bug, but the probability of this seems low. It is still a real problem that should be solved.

I think the list manipulation needs to be capable of being asynchronously interrupted by the fork and that needs to be taken into consideration.

In bug 17326 we make the locking more complex, and I think that's a bad idea. This needs to work in a lock-free manner.
Comment 2 Carlos O'Donell 2020-06-11 17:58:39 UTC
*** Bug 17326 has been marked as a duplicate of this bug. ***
Comment 3 buque 2020-06-12 02:38:24 UTC
    Hi, your analysis is exactly what I think.
    
    We have a device installed with glibc 2.17 (CentOS 7.5); process B loops endlessly in lines 2-6 of the listing below, with a CPU core at 100%.
    At first it looks like a crash: the bad address 0x11940177fe9c0 was originally 0x7fd9177fe9c0, and line 5 corrupted this pointer later (self->pid = 72000). Surprisingly, it does not crash; I guess there is a ring in the list.
    As you say, the problem is very hard to reproduce. glibc 2.17 has been in use for several years and this has happened only once; I will try to reproduce it with a white-box test in a few days.
    I think it is hard to solve this bug in a lock-free manner: it cannot stop concurrent reading and writing, and this exposes intermediate states. Maybe you have a better way.


(gdb) p 0x11940
$1 = 72000

1  /* Reset the PIDs in any cached stacks.  */
2  list_for_each (runp, &stack_cache)
3    {
4      struct pthread *curp = list_entry (runp, struct pthread, list);
5      curp->pid = self->pid;
6    }

Detaching from program: /usr/bin/sysmonitor, process 72000
[root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]#

(gdb) info r
rax            0x7fd9167fc9c0   140570362104256
rbx            0x7fd93f30f010   140571044802576
rcx            0x7fd9177fe9c0   140570378889664
rdx            0x11940  72000
rsi            0x7fd915ffb9c0   140570353711552
rdi            0x7fd93eeef5c0   140571040478656
rbp            0x7fd93f30f020   0x7fd93f30f020 <stack_cache>
rsp            0x7fd9357f9608   0x7fd9357f9608
r8             0x7fd93f30f010   140571044802576
r9             0x159a   5530
r10            0x7fd93eb23700   140571036497664
r11            0x7fd9357fa700   140570882189056
r12            0x0      0
r13            0x0      0
r14            0x7fd93f749000   140571049234432
r15            0x7fd9357f99e0   140570882185696
rip            0x7fd93f0ff3aa   0x7fd93f0ff3aa <__reclaim_stacks+538>
eflags         0x287    [ CF PF SF IF ]
cs             0x33     51
ss             0x2b     43
ds             0x0      0
es             0x0      0
fs             0x0      0
gs             0x0      0
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$3 = (struct pthread *) 0x7fd916ffd700
(gdb) p stack_cache
$4 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) p stack_cache.prev
$5 = (struct list_head *) 0x7fd9177fe9c0
(gdb) p stack_cache.prev->prev
$6 = (struct list_head *) 0x7fd93f30f020 <stack_cache>
(gdb) p curp
$7 = (struct pthread *) 0x7fd916ffd700
(gdb) i r
rax            0x7fd916ffd9c0   140570370496960
rbx            0x7fd93f30f010   140571044802576
rcx            0x7fd9177fe9c0   140570378889664
rdx            0x11940  72000
rsi            0x7fd915ffb9c0   140570353711552
rdi            0x7fd93eeef5c0   140571040478656
rbp            0x7fd93f30f020   0x7fd93f30f020 <stack_cache>
rsp            0x7fd9357f9608   0x7fd9357f9608
r8             0x7fd93f30f010   140571044802576
r9             0x159a   5530
r10            0x7fd93eb23700   140571036497664
r11            0x7fd9357fa700   140570882189056
r12            0x0      0
r13            0x0      0
r14            0x7fd93f749000   140571049234432
r15            0x7fd9357f99e0   140570882185696
rip            0x7fd93f0ff3a0   0x7fd93f0ff3a0 <__reclaim_stacks+528>
eflags         0x287    [ CF PF SF IF ]
cs             0x33     51
ss             0x2b     43
ds             0x0      0
es             0x0      0
fs             0x0      0
gs             0x0      0
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$8 = (struct pthread *) 0x7fd90f7fe700
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$9 = (struct pthread *) 0x7fd917fff700
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$10 = (struct pthread *) 0x7fd934ff9700
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb)
900           curp->pid = self->pid;
(gdb) p curp
$11 = (struct pthread *) 0x7fd9357fa700
(gdb) p stack_cache
$12 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) q
A debugging session is active.

        Inferior 1 [process 72000] will be detached.

(gdb) p stack_cache
$1 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) p stack_cache.next
$2 = (struct list_head *) 0x11940177fe9c0
(gdb) p stack_cache.next->next
Cannot access memory at address 0x11940177fe9c0   //0x11940177fe9c0(0x7fd9177fe9c0)
Comment 4 buque 2020-06-12 02:39:01 UTC
    Hi, your analysis is exactly what I think.
    
    We have a device installed with glibc 2.17 (CentOS 7.5); process B loops endlessly in lines 2-6 of the listing below, with a CPU core at 100%.
    At first it looks like a crash: the bad address 0x11940177fe9c0 was originally 0x7fd9177fe9c0, and line 5 corrupted this pointer later (self->pid = 72000). Surprisingly, it does not crash; I guess there is a ring in the list.
    As you say, the problem is very hard to reproduce. glibc 2.17 has been in use for several years and this has happened only once; I will try to reproduce it with a white-box test in a few days.
    I think it is hard to solve this bug in a lock-free manner: it cannot stop concurrent reading and writing, and this exposes intermediate states. Maybe you have a better way.


(gdb) p 0x11940
$1 = 72000

1  /* Reset the PIDs in any cached stacks.  */
2  list_for_each (runp, &stack_cache)
3    {
4      struct pthread *curp = list_entry (runp, struct pthread, list);
5      curp->pid = self->pid;
6    }

Detaching from program: /usr/bin/sysmonitor, process 72000
[root@cn-north-4b-CloudDataCompassSurfer-010077236019 ~]#

(gdb) info r
rax            0x7fd9167fc9c0   140570362104256
rbx            0x7fd93f30f010   140571044802576
rcx            0x7fd9177fe9c0   140570378889664
rdx            0x11940  72000
rsi            0x7fd915ffb9c0   140570353711552
rdi            0x7fd93eeef5c0   140571040478656
rbp            0x7fd93f30f020   0x7fd93f30f020 <stack_cache>
rsp            0x7fd9357f9608   0x7fd9357f9608
r8             0x7fd93f30f010   140571044802576
r9             0x159a   5530
r10            0x7fd93eb23700   140571036497664
r11            0x7fd9357fa700   140570882189056
r12            0x0      0
r13            0x0      0
r14            0x7fd93f749000   140571049234432
r15            0x7fd9357f99e0   140570882185696
rip            0x7fd93f0ff3aa   0x7fd93f0ff3aa <__reclaim_stacks+538>
eflags         0x287    [ CF PF SF IF ]
cs             0x33     51
ss             0x2b     43
ds             0x0      0
es             0x0      0
fs             0x0      0
gs             0x0      0
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$3 = (struct pthread *) 0x7fd916ffd700
(gdb) p stack_cache
$4 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) p stack_cache.prev
$5 = (struct list_head *) 0x7fd9177fe9c0
(gdb) p stack_cache.prev->prev
$6 = (struct list_head *) 0x7fd93f30f020 <stack_cache>
(gdb) p curp
$7 = (struct pthread *) 0x7fd916ffd700
(gdb) i r
rax            0x7fd916ffd9c0   140570370496960
rbx            0x7fd93f30f010   140571044802576
rcx            0x7fd9177fe9c0   140570378889664
rdx            0x11940  72000
rsi            0x7fd915ffb9c0   140570353711552
rdi            0x7fd93eeef5c0   140571040478656
rbp            0x7fd93f30f020   0x7fd93f30f020 <stack_cache>
rsp            0x7fd9357f9608   0x7fd9357f9608
r8             0x7fd93f30f010   140571044802576
r9             0x159a   5530
r10            0x7fd93eb23700   140571036497664
r11            0x7fd9357fa700   140570882189056
r12            0x0      0
r13            0x0      0
r14            0x7fd93f749000   140571049234432
r15            0x7fd9357f99e0   140570882185696
rip            0x7fd93f0ff3a0   0x7fd93f0ff3a0 <__reclaim_stacks+528>
eflags         0x287    [ CF PF SF IF ]
cs             0x33     51
ss             0x2b     43
ds             0x0      0
es             0x0      0
fs             0x0      0
gs             0x0      0
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$8 = (struct pthread *) 0x7fd90f7fe700
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$9 = (struct pthread *) 0x7fd917fff700
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb) n
900           curp->pid = self->pid;
(gdb) p curp
$10 = (struct pthread *) 0x7fd934ff9700
(gdb) n
897       list_for_each (runp, &stack_cache)
(gdb)
900           curp->pid = self->pid;
(gdb) p curp
$11 = (struct pthread *) 0x7fd9357fa700
(gdb) p stack_cache
$12 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) q
A debugging session is active.

        Inferior 1 [process 72000] will be detached.

(gdb) p stack_cache
$1 = {next = 0x11940177fe9c0, prev = 0x7fd9177fe9c0}
(gdb) p stack_cache.next
$2 = (struct list_head *) 0x11940177fe9c0
(gdb) p stack_cache.next->next
Cannot access memory at address 0x11940177fe9c0   //0x11940177fe9c0(0x7fd9177fe9c0)