Originally reported as http://bugzilla.redhat.com/show_bug.cgi?id=628608 As shown by this code, the kernel clears the robust list for the child after a fork: #include <stdio.h> #include <stddef.h> #include <stdlib.h> #include <unistd.h> #include <sys/wait.h> #include <sys/syscall.h> #include <syscall.h> int main() { void *list; size_t sz; if (syscall(__NR_get_robust_list, 0, &list, &sz)) abort(); if (!list) abort(); if (fork() == 0) { if (syscall(__NR_get_robust_list, 0, &list, &sz)) return 1; if (!list) return 2; return 0; } int status; waitpid(-1, &status, 0); if (!WIFEXITED(status)) { printf("child exited abnormally\n"); return 1; } switch(WEXITSTATUS(status)) { case 1: printf("child failed to call get_robust_list\n"); return 1; case 2: printf("child has no robust list\n"); return 1; default: printf("child exited normally\n"); } return 0; } If a parent process and child process share a robust mutex and the child exits while holding the mutex lock, when the parent tries to acquire the lock it will hang instead of being notified of the state by EOWNERDEAD. 
Here's a testcase which exits successfully on Solaris but deadlocks with NPTL: #include <sys/types.h> #include <sys/mman.h> #include <sys/stat.h> #include <sys/wait.h> #include <pthread.h> #include <unistd.h> #include <stdlib.h> #include <errno.h> #include <stdio.h> #include <fcntl.h> void initialize(pthread_mutex_t* mtx) { pthread_mutexattr_t mtxa; if(pthread_mutexattr_init(&mtxa)) abort(); if(pthread_mutexattr_setpshared(&mtxa, PTHREAD_PROCESS_SHARED)) abort(); if(pthread_mutexattr_setrobust_np(&mtxa, PTHREAD_MUTEX_ROBUST_NP)) abort(); if(pthread_mutex_init(mtx, &mtxa)) abort(); pthread_mutexattr_destroy(&mtxa); } void set_consistent(pthread_mutex_t* mtx) { if(pthread_mutex_consistent_np(mtx)) abort(); } void lock(pthread_mutex_t* mtx) { int err; if((err = pthread_mutex_lock(mtx))) { if(EOWNERDEAD == err) { // handle abandoned mutex if(pthread_mutex_consistent_np(mtx)) abort(); } else abort(); } } void unlock(pthread_mutex_t* mtx) { if(pthread_mutex_unlock(mtx)) abort(); } pid_t spawn(int(*fn)()) { // fork a child process pid_t pid = fork(); switch(pid) { case 0: exit(fn()); case -1: abort(); default: return pid; } } char const shared_file[] = "shared_file"; void* open_shared_file() { int fd = open(shared_file, O_CREAT | O_RDWR, (mode_t)0666); if(fd < 0) abort(); struct stat st; if(fstat(fd, &st)) abort(); int new_file = !st.st_size; if (new_file) if(ftruncate(fd, sizeof(pthread_mutex_t))) abort(); void* mem = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); close(fd); if(MAP_FAILED == mem) abort(); if (new_file) initialize((pthread_mutex_t*)mem); return mem; } int process_1() { unsigned pid = getpid(); printf("%u: process 1\n", pid); pthread_mutex_t* m = (pthread_mutex_t*)open_shared_file(); printf("%u: locking mutex...\n", pid); lock(m); printf("%u: exiting\n", pid); return 0; } int process_2() { unsigned pid = getpid(); printf("%u: process 2\n", pid); pthread_mutex_t* m = (pthread_mutex_t*)open_shared_file(); printf("%u: locking mutex...\n", pid); lock(m); printf("%u: mutex locked\n", pid); unlock(m); return 0; } int main(int ac, char** av) { // fork process_1 and wait till it terminates pid_t child; unlink(shared_file); int child_status; child = spawn(process_1); if(-1 == waitpid(child, &child_status, 0)) abort(); // now do process_2 return process_2(); }
Check the code before reporting problems; this has been fixed for some time.
ah yes, 6f8326cacd08bf7d1966743086855fc36574bf74 - sorry, and thanks!
Is it enough to only do the set_robust_list syscall in the child after fork? If the parent is a multi-threaded process, then while one thread is in fork, the others might be calling pthread_mutex_lock, so the parent's (user-space) robust list is not empty. When the fork is done, there will be some mutexes in the child's robust list that are not actually owned by the child. I think the mutexes in the robust list should be cleared in the child.
*** Bug 260998 has been marked as a duplicate of this bug. *** Seen from the domain http://volichat.com Page where seen: http://volichat.com/adult-chat-rooms Marked for reference. Resolved as fixed @bugzilla.