pthread_cancel/pthread_join deadlock

Martin Koegler martin.koegler@chello.at
Thu Dec 19 21:44:00 GMT 2013


The following test program very likely produces a lockup if it is started via GDB:
=============
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>
#include <poll.h>

void* thread(void* arg) /* Worker body: calls poll() 100 times, then returns NULL; arg is unused. */
{
        int i = 0; /* counts completed poll() calls */
do{
struct pollfd f; /* revents is left unset here; only fd and events are filled in */
f.fd = 0; /* descriptor 0 (normally standard input) */
f.events = POLLPRI; /* only ask for POLLPRI events */
poll(&f, 1, 10); /* waits up to 10 ms, result ignored; the report's backtrace shows the stuck thread inside this call (t.c:13) */
 i++;
} while(i < 100); /* 100 iterations in total */
return NULL;
}

#define CNT 5 /* number of worker threads handled per round */

int main() /* Endlessly starts CNT workers, then cancels, joins and restarts each one in turn. */
{
void*res; /* receives the joined thread's exit value; never inspected */
 int i;
pthread_t t[CNT]; /* handles of the current workers */
 do{
         printf("New\n");
 for(i = 0; i < CNT; i++)
         pthread_create(&t[i], NULL, thread, 0); /* return value is not checked */
         printf("Stop\n");
 for(i = 0; i < CNT; i++)
 {
   pthread_cancel(t[i]); /* request cancellation of worker i */
   pthread_join(t[i], &res); /* the report's backtrace shows main blocked here (t.c:34) */
   pthread_create(&t[i], NULL, thread, 0); /* NOTE(review): this replacement is never joined; the next round's pthread_create overwrites t[i] - confirm this is intended */
 }
 }while(1); /* no exit condition */
return 0; /* not reached: the loop above never terminates */
}
===========

Program received signal SIGINT, Interrupt.
0x00007ffff7bc8e75 in pthread_join (threadid=140736042649344, thread_return=0x7fffffffe140) at pthread_join.c:89
89      in pthread_join.c
(gdb) info threads
  Id   Target Id         Frame 
  312  Thread 0x7fffa9d44700 (LWP 28096) "t" __libc_disable_asynccancel () at ../nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S:116
* 1    Thread 0x7ffff7fb6700 (LWP 27783) "t" 0x00007ffff7bc8e75 in pthread_join (threadid=140736042649344, thread_return=0x7fffffffe140) at pthread_join.c:89
(gdb) bt
#0  0x00007ffff7bc8e75 in pthread_join (threadid=140736042649344, thread_return=0x7fffffffe140) at pthread_join.c:89
#1  0x000000000040077e in main () at t.c:34
(gdb) info registers
rax            0xfffffffffffffe00       -512
rbx            0x7fffa9d44700   140736042649344
rcx            0xffffffffffffffff       -1
rdx            0x6dc0   28096
rsi            0x0      0
rdi            0x7fffa9d449d0   140736042650064
rbp            0x7fffffffe140   0x7fffffffe140
rsp            0x7fffffffe0b0   0x7fffffffe0b0
r8             0x0      0
r9             0x7ffff7fb6700   140737353836288
r10            0x0      0
r11            0x246    582
r12            0x7fffffffe0c0   140737488347328
r13            0x7fffa9d44d28   140736042650920
r14            0x7ffff7fb6700   140737353836288
r15            0x0      0
rip            0x7ffff7bc8e75   0x7ffff7bc8e75 <pthread_join+277>
eflags         0x246    [ PF ZF IF ]
cs             0x33     51
ss             0x2b     43
ds             0x0      0
es             0x0      0
fs             0x0      0
gs             0x0      0
(gdb) disassemble 
Dump of assembler code for function pthread_join:
   0x00007ffff7bc8d60 <+0>:     mov    %rbx,-0x28(%rsp)
   0x00007ffff7bc8d65 <+5>:     mov    %rbp,-0x20(%rsp)
   0x00007ffff7bc8d6a <+10>:    mov    %rdi,%rbx
   0x00007ffff7bc8d6d <+13>:    mov    %r12,-0x18(%rsp)
   0x00007ffff7bc8d72 <+18>:    mov    %r13,-0x10(%rsp)
   0x00007ffff7bc8d77 <+23>:    mov    %rsi,%rbp
   0x00007ffff7bc8d7a <+26>:    mov    %r14,-0x8(%rsp)
   0x00007ffff7bc8d7f <+31>:    sub    $0x58,%rsp
   0x00007ffff7bc8d83 <+35>:    test   %rdi,%rdi
   0x00007ffff7bc8d86 <+38>:    je     0x7ffff7bc8ecb <pthread_join+363>
   0x00007ffff7bc8d8c <+44>:    mov    0x2d0(%rdi),%eax
   0x00007ffff7bc8d92 <+50>:    test   %eax,%eax
   0x00007ffff7bc8d94 <+52>:    js     0x7ffff7bc8ecb <pthread_join+363>
   0x00007ffff7bc8d9a <+58>:    cmp    %rdi,0x628(%rdi)
   0x00007ffff7bc8da1 <+65>:    mov    $0x16,%eax
   0x00007ffff7bc8da6 <+70>:    je     0x7ffff7bc8e3f <pthread_join+223>
   0x00007ffff7bc8dac <+76>:    lea    0x628(%rdi),%r13
   0x00007ffff7bc8db3 <+83>:    lea    0x10(%rsp),%r12
   0x00007ffff7bc8db8 <+88>:    lea    -0x7f(%rip),%rsi        # 0x7ffff7bc8d40 <cleanup>
   0x00007ffff7bc8dbf <+95>:    mov    %fs:0x10,%r14
   0x00007ffff7bc8dc8 <+104>:   mov    %r13,%rdx
   0x00007ffff7bc8dcb <+107>:   mov    %r12,%rdi
   0x00007ffff7bc8dce <+110>:   callq  0x7ffff7bce800 <_pthread_cleanup_push>
   0x00007ffff7bc8dd3 <+115>:   callq  0x7ffff7bcebe0 <__pthread_enable_asynccancel>
   0x00007ffff7bc8dd8 <+120>:   cmp    %r14,%rbx
   0x00007ffff7bc8ddb <+123>:   mov    %eax,%r8d
   0x00007ffff7bc8dde <+126>:   je     0x7ffff7bc8e8d <pthread_join+301>
   0x00007ffff7bc8de4 <+132>:   cmp    %rbx,0x628(%r14)
   0x00007ffff7bc8deb <+139>:   je     0x7ffff7bc8e80 <pthread_join+288>
   0x00007ffff7bc8df1 <+145>:   xor    %eax,%eax
   0x00007ffff7bc8df3 <+147>:   lock cmpxchg %r14,0x0(%r13)
   0x00007ffff7bc8df9 <+153>:   jne    0x7ffff7bc8ec4 <pthread_join+356>
   0x00007ffff7bc8dff <+159>:   mov    0x2d0(%rbx),%edx
   0x00007ffff7bc8e05 <+165>:   test   %edx,%edx
   0x00007ffff7bc8e07 <+167>:   jne    0x7ffff7bc8e60 <pthread_join+256>
   0x00007ffff7bc8e09 <+169>:   mov    %r8d,%edi
   0x00007ffff7bc8e0c <+172>:   callq  0x7ffff7bcec40 <__pthread_disable_asynccancel>
   0x00007ffff7bc8e11 <+177>:   xor    %esi,%esi
   0x00007ffff7bc8e13 <+179>:   mov    %r12,%rdi
   0x00007ffff7bc8e16 <+182>:   callq  0x7ffff7bce820 <_pthread_cleanup_pop>
   0x00007ffff7bc8e1b <+187>:   test   %rbp,%rbp
   0x00007ffff7bc8e1e <+190>:   movl   $0xffffffff,0x2d0(%rbx)
   0x00007ffff7bc8e28 <+200>:   je     0x7ffff7bc8e35 <pthread_join+213>
   0x00007ffff7bc8e2a <+202>:   mov    0x630(%rbx),%rax
   0x00007ffff7bc8e31 <+209>:   mov    %rax,0x0(%rbp)
   0x00007ffff7bc8e35 <+213>:   mov    %rbx,%rdi
   0x00007ffff7bc8e38 <+216>:   callq  0x7ffff7bc7a40 <__free_tcb>
   0x00007ffff7bc8e3d <+221>:   xor    %eax,%eax
   0x00007ffff7bc8e3f <+223>:   mov    0x30(%rsp),%rbx
   0x00007ffff7bc8e44 <+228>:   mov    0x38(%rsp),%rbp
   0x00007ffff7bc8e49 <+233>:   mov    0x40(%rsp),%r12
   0x00007ffff7bc8e4e <+238>:   mov    0x48(%rsp),%r13
   0x00007ffff7bc8e53 <+243>:   mov    0x50(%rsp),%r14
   0x00007ffff7bc8e58 <+248>:   add    $0x58,%rsp
   0x00007ffff7bc8e5c <+252>:   retq   
   0x00007ffff7bc8e5d <+253>:   nopl   (%rax)
   0x00007ffff7bc8e60 <+256>:   lea    0x2d0(%rbx),%rdi
   0x00007ffff7bc8e67 <+263>:   xor    %esi,%esi
   0x00007ffff7bc8e69 <+265>:   xor    %r10,%r10
   0x00007ffff7bc8e6c <+268>:   mov    $0xca,%rax
   0x00007ffff7bc8e73 <+275>:   syscall 
=> 0x00007ffff7bc8e75 <+277>:   cmpl   $0x0,(%rdi)
   0x00007ffff7bc8e78 <+280>:   jne    0x7ffff7bc8e6c <pthread_join+268>
   0x00007ffff7bc8e7a <+282>:   jmp    0x7ffff7bc8e09 <pthread_join+169>
   0x00007ffff7bc8e7c <+284>:   nopl   0x0(%rax)
   0x00007ffff7bc8e80 <+288>:   testb  $0x3c,0x308(%rbx)
   0x00007ffff7bc8e87 <+295>:   jne    0x7ffff7bc8df1 <pthread_join+145>
   0x00007ffff7bc8e8d <+301>:   mov    0x308(%r14),%edx
   0x00007ffff7bc8e94 <+308>:   mov    $0x23,%eax
---Type <return> to continue, or q <return> to quit---
   0x00007ffff7bc8e99 <+313>:   and    $0xffffffb9,%edx
   0x00007ffff7bc8e9c <+316>:   cmp    $0x8,%edx
   0x00007ffff7bc8e9f <+319>:   je     0x7ffff7bc8df1 <pthread_join+145>
   0x00007ffff7bc8ea5 <+325>:   mov    %r8d,%edi
   0x00007ffff7bc8ea8 <+328>:   mov    %eax,0x8(%rsp)
   0x00007ffff7bc8eac <+332>:   callq  0x7ffff7bcec40 <__pthread_disable_asynccancel>
   0x00007ffff7bc8eb1 <+337>:   xor    %esi,%esi
   0x00007ffff7bc8eb3 <+339>:   mov    %r12,%rdi
   0x00007ffff7bc8eb6 <+342>:   callq  0x7ffff7bce820 <_pthread_cleanup_pop>
   0x00007ffff7bc8ebb <+347>:   mov    0x8(%rsp),%eax
   0x00007ffff7bc8ebf <+351>:   jmpq   0x7ffff7bc8e3f <pthread_join+223>
   0x00007ffff7bc8ec4 <+356>:   mov    $0x16,%eax
   0x00007ffff7bc8ec9 <+361>:   jmp    0x7ffff7bc8ea5 <pthread_join+325>
   0x00007ffff7bc8ecb <+363>:   mov    $0x3,%eax
   0x00007ffff7bc8ed0 <+368>:   jmpq   0x7ffff7bc8e3f <pthread_join+223>
End of assembler dump.
(gdb) thread 312
[Switching to thread 312 (Thread 0x7fffa9d44700 (LWP 28096))]
#0  __libc_disable_asynccancel () at ../nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S:116
116     ../nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S: No such file or directory.
(gdb) bt
#0  __libc_disable_asynccancel () at ../nptl/sysdeps/unix/sysv/linux/x86_64/cancellation.S:116
#1  0x00007ffff7906e4b in *__GI___poll (fds=<optimized out>, nfds=<optimized out>, timeout=4) at ../sysdeps/unix/sysv/linux/poll.c:89
#2  0x00000000004006e2 in thread (arg=0x0) at t.c:13
#3  0x00007ffff7bc7b50 in start_thread (arg=<optimized out>) at pthread_create.c:304
#4  0x00007ffff7911a7d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:112
#5  0x0000000000000000 in ?? ()
(gdb) info registers
rax            0xfffffffffffffe00       -512
rbx            0x7fffa9d44700   140736042649344
rcx            0xffffffffffffffff       -1
rdx            0x4      4
rsi            0x80     128
rdi            0x7fffa9d44a08   140736042650120
rbp            0x7fffa9d43ee0   0x7fffa9d43ee0
rsp            0x7fffa9d43e88   0x7fffa9d43e88
r8             0x0      0
r9             0x7fffa9d44700   140736042649344
r10            0x0      0
r11            0x202    514
r12            0x7ffff7bd21a0   140737349755296
r13            0x7fffa9d449c0   140736042650048
r14            0x7ffff7ffd040   140737354125376
r15            0x3      3
rip            0x7ffff791e5af   0x7ffff791e5af <__libc_disable_asynccancel+79>
eflags         0x202    [ IF ]
cs             0x33     51
ss             0x2b     43
ds             0x0      0
es             0x0      0
fs             0x0      0
gs             0x0      0
(gdb) disassemble 
Dump of assembler code for function __libc_disable_asynccancel:
   0x00007ffff791e560 <+0>:     test   $0x2,%edi
   0x00007ffff791e566 <+6>:     jne    0x7ffff791e58f <__libc_disable_asynccancel+47>
   0x00007ffff791e568 <+8>:     mov    %fs:0x308,%eax
   0x00007ffff791e570 <+16>:    mov    %eax,%r11d
   0x00007ffff791e573 <+19>:    and    $0xfffffffd,%r11d
   0x00007ffff791e577 <+23>:    lock cmpxchg %r11d,%fs:0x308
   0x00007ffff791e582 <+34>:    jne    0x7ffff791e570 <__libc_disable_asynccancel+16>
   0x00007ffff791e584 <+36>:    mov    %r11d,%eax
   0x00007ffff791e587 <+39>:    and    $0xc,%eax
   0x00007ffff791e58a <+42>:    cmp    $0x4,%eax
   0x00007ffff791e58d <+45>:    je     0x7ffff791e590 <__libc_disable_asynccancel+48>
   0x00007ffff791e58f <+47>:    retq   
   0x00007ffff791e590 <+48>:    mov    %fs:0x0,%rdi
   0x00007ffff791e599 <+57>:    mov    $0xca,%eax
   0x00007ffff791e59e <+62>:    xor    %r10,%r10
   0x00007ffff791e5a1 <+65>:    add    $0x308,%rdi
   0x00007ffff791e5a8 <+72>:    mov    $0x80,%esi
   0x00007ffff791e5ad <+77>:    syscall 
=> 0x00007ffff791e5af <+79>:    mov    %fs:0x308,%eax
   0x00007ffff791e5b7 <+87>:    jmp    0x7ffff791e587 <__libc_disable_asynccancel+39>
End of assembler dump.

[OS: Debian Stable]

pthread_join wants to wait for a canceled thread (via futex), but the canceled thread blocks in __libc_disable_asynccancel, also waiting on a futex.

Most of the time, starting it via GDB leads to a lockup in less than a minute.

Regards,
Martin
PS: Please CC me on replies.



More information about the Libc-help mailing list