[PATCH] Fix PR 12702 - gdb can hang waiting for thread group leader (gdbserver)

Hui Zhu teawater@gmail.com
Tue Nov 19 06:19:00 GMT 2013


On Thu, Jul 25, 2013 at 2:25 AM, Pedro Alves <palves@redhat.com> wrote:
> On 07/20/2013 10:59 AM, Hui Zhu wrote:
>> Hi,
>>
>> I got a issue with leader-exit.exp will hang with target_board=native-gdbserver.
>> I found the reason is gdbserver still have this issue.  So I post a
>> patch for gdbserver.
>
> leader-exit.exp is part of a series of tests for a set of related
> problems.  See:
>
>   http://www.sourceware.org/ml/gdb-patches/2011-10/msg00704.html
>
> Several are currently masked because gdbserver doesn't support
> fork/exec yet.
>
> We should fix this by implementing TARGET_WAITKIND_NO_RESUMED
> on gdbserver, like the patch above did for native.
>
> --
> Pedro Alves
>

Hi Pedro,

According to your comments, I make a new patch to add
TARGET_WAITKIND_NO_RESUMED to gdbserver.
It can pass the leader-exit.exp and pass the regression test.

Please help me review it.

Thanks,
Hui

2013-11-19  Hui Zhu  <hui@codesourcery.com>

* common/ptid.c (ptid_match): New.
* common/ptid.h (ptid_match): New extern.
* inferior.h (ptid_match): Removed.
* infrun.c (ptid_match): Removed.

2013-11-19  Hui Zhu  <hui@codesourcery.com>

* linux-low.c (sigchld_mask): New.
(handle_extended_wait): Add debug output.
(num_lwps): New.
(check_zombie_leaders_callback): New.
(check_zombie_leaders): New.
(not_suspended_callback): New.
(check_pending_stop): New.
(linux_wait_for_lwp): Call my_waitpid with WNOHANG.
(linux_wait_for_event): Change the return check for
linux_wait_for_lwp.
(wait_lwp): New.
(wait_for_sigstop): Call wait_lwp.
* linux-low.h (lwp_info): Add pending_stop.
* server.c (handle_target_event): Add check for
TARGET_WAITKIND_NO_RESUMED.
-------------- next part --------------
--- a/gdb/common/ptid.c
+++ b/gdb/common/ptid.c
@@ -114,3 +114,17 @@ ptid_tid_p (ptid_t ptid)
 
   return (ptid_get_tid (ptid) != 0);
 }
+
+int
+ptid_match (ptid_t ptid, ptid_t filter)
+{
+  if (ptid_equal (filter, minus_one_ptid))
+    return 1;
+  if (ptid_is_pid (filter)
+      && ptid_get_pid (ptid) == ptid_get_pid (filter))
+    return 1;
+  else if (ptid_equal (ptid, filter))
+    return 1;
+
+  return 0;
+}
--- a/gdb/common/ptid.h
+++ b/gdb/common/ptid.h
@@ -78,4 +78,14 @@ int ptid_lwp_p (ptid_t ptid);
 /* Return true if PTID's tid member is non-zero.  */
 int ptid_tid_p (ptid_t ptid);
 
+/* Returns true if PTID matches filter FILTER.  FILTER can be the wild
+   card MINUS_ONE_PTID (all ptid match it); can be a ptid representing
+   a process (ptid_is_pid returns true), in which case, all lwps and
+   threads of that given process match, lwps and threads of other
+   processes do not; or, it can represent a specific thread, in which
+   case, only that thread will match true.  PTID must represent a
+   specific LWP or THREAD, it can never be a wild card.  */
+
+extern int ptid_match (ptid_t ptid, ptid_t filter);
+
 #endif
--- a/gdb/gdbserver/linux-low.c
+++ b/gdb/gdbserver/linux-low.c
@@ -220,6 +220,9 @@ int using_threads = 1;
    jump pads).  */
 static int stabilizing_threads;
 
+/* Signals to block SIGCHLD.  */
+static sigset_t sigchld_mask;
+
 static void linux_resume_one_lwp (struct lwp_info *lwp,
 				  int step, int signal, siginfo_t *info);
 static void linux_resume (struct thread_resume *resume_info, size_t n);
@@ -407,6 +410,11 @@ handle_extended_wait (struct lwp_info *e
 	    warning ("wait returned unexpected status 0x%x", status);
 	}
 
+      if (debug_threads)
+	fprintf (stderr, "HEW: Got clone event "
+		         "from LWP %ld, new child is LWP %ld\n",
+		 lwpid_of (event_child), new_pid);
+
       ptid = ptid_build (pid_of (event_child), new_pid, 0);
       new_lwp = (struct lwp_info *) add_lwp (ptid);
       add_thread (ptid, new_lwp);
@@ -1268,30 +1276,148 @@ find_lwp_pid (ptid_t ptid)
   return (struct lwp_info*) find_inferior (&all_lwps, same_lwp, &ptid);
 }
 
+/* Return the number of known LWPs in the tgid given by PID.  */
+
+static int
+num_lwps (int pid)
+{
+  int count = 0;
+  struct inferior_list_entry *inf = all_lwps.head;
+
+  while (inf != NULL)
+    {
+      struct inferior_list_entry *next;
+
+      next = inf->next;
+      if (ptid_get_pid(inf->id) == pid)
+	count++;
+      inf = next;
+    }
+
+  return count;
+}
+
+static void
+check_zombie_leaders_callback (struct inferior_list_entry *inf)
+{
+  pid_t leader_pid = ptid_get_pid (inf->id);
+  struct lwp_info *leader_lp;
+
+  leader_lp = find_lwp_pid (inf->id);
+  if (leader_lp != NULL
+      /* Check if there are other threads in the group, as we may
+	 have raced with the inferior simply exiting.  */
+      && num_lwps (leader_pid) > 1
+      && linux_proc_pid_is_zombie (leader_pid))
+    delete_lwp (leader_lp);
+}
+
+/* Detect zombie thread group leaders, and "exit" them.  We can't reap
+   their exits until all other threads in the group have exited.  */
+
+static void
+check_zombie_leaders (void)
+{
+  for_each_inferior (&all_processes, check_zombie_leaders_callback);
+}
+
+static int
+not_suspended_callback (struct inferior_list_entry *inf, void *arg)
+{
+  ptid_t filter = *(ptid_t *)arg;
+  struct lwp_info *lwp;
+
+  if (!ptid_match (inf->id, filter))
+    return 0;
+
+  lwp = find_lwp_pid (inf->id);
+  if (lwp == NULL)
+    return 0;
+
+  if (lwp->suspended)
+    return 0;
+
+  return 1;
+}
+
+static int
+check_pending_stop (struct inferior_list_entry *inf, void *data)
+{
+  ptid_t ptid = *(ptid_t *) data;
+  struct lwp_info *lwp;
+
+  if (ptid_equal (ptid, minus_one_ptid)
+      || ptid_get_pid(inf->id) == ptid_get_pid(ptid))
+    {
+      lwp = find_lwp_pid (inf->id);
+      if (lwp->pending_stop)
+	return 1;
+    }
+
+  return 0;
+}
+
 static struct lwp_info *
 linux_wait_for_lwp (ptid_t ptid, int *wstatp, int options)
 {
-  int ret;
-  int to_wait_for = -1;
+  int ret = 0;
   struct lwp_info *child = NULL;
+  sigset_t prev_mask;
 
   if (debug_threads)
     fprintf (stderr, "linux_wait_for_lwp: %s\n", target_pid_to_str (ptid));
 
-  if (ptid_equal (ptid, minus_one_ptid))
-    to_wait_for = -1;			/* any child */
-  else
-    to_wait_for = ptid_get_lwp (ptid);	/* this lwp only */
+  /* Make sure SIGCHLD is blocked.  */
+  sigprocmask (SIG_BLOCK, &sigchld_mask, &prev_mask);
 
-  options |= __WALL;
+  /* First check if there is a LWP with a wait status pending.  */
+  if (ptid_equal (ptid, minus_one_ptid) || ptid_get_lwp (ptid) == 0)
+    child = (struct lwp_info*) find_inferior (&all_lwps, check_pending_stop,
+					      &ptid);
+  else
+    child = find_lwp_pid (ptid);
+  if (child && child->pending_stop)
+    {
+      if (debug_threads)
+	fprintf (stderr,
+		 "linux_wait_for_lwp: %s has a wait status pending.\n",
+		 target_pid_to_str (ptid_of (child)));
+      *wstatp = child->last_status;
+      child->pending_stop = 0;
+      child->stopped = 1;
+      goto got_child;
+    }
 
 retry:
+  ret = my_waitpid (-1, wstatp, __WCLONE | WNOHANG);
+  if (ret == 0 || (ret == -1 && errno == ECHILD))
+    ret = my_waitpid (-1, wstatp, WNOHANG);
 
-  ret = my_waitpid (to_wait_for, wstatp, options);
-  if (ret == 0 || (ret == -1 && errno == ECHILD && (options & WNOHANG)))
-    return NULL;
-  else if (ret == -1)
-    perror_with_name ("waitpid");
+  if (ret <= 0)
+    {
+      sigset_t suspend_mask;
+
+      /* Check for zombie thread group leaders.  Those can't be reaped
+	 until all other threads in the thread group are.  */
+      check_zombie_leaders ();
+
+      /* If there are no resumed children left, bail.  We'd be stuck
+	 forever in the sigsuspend call below otherwise.
+	 OR no interesting event to report to the core.*/
+      if ((find_inferior (&all_lwps, not_suspended_callback, &ptid) == NULL)
+	  || (options & WNOHANG))
+	{
+	  sigprocmask (SIG_SETMASK, &prev_mask, NULL);
+	  return NULL;
+	}
+
+      /* Block until we get an event reported with SIGCHLD.  */
+      sigprocmask (SIG_SETMASK, NULL, &suspend_mask);
+      sigdelset (&suspend_mask, SIGCHLD);
+      sigsuspend (&suspend_mask);
+
+      goto retry;
+    }
 
   if (debug_threads
       && (!WIFSTOPPED (*wstatp)
@@ -1408,6 +1534,24 @@ retry:
       current_inferior = saved_inferior;
     }
 
+  if (!ptid_match (ptid_of (child), ptid))
+    {
+      if (debug_threads)
+        {
+	  char child_ptid_buf[80];
+
+	  strcpy (child_ptid_buf, target_pid_to_str (ptid_of (child)));
+	  fprintf (stderr, "linux_wait_for_lwp: %s is not same with %s.\n",
+		   child_ptid_buf, target_pid_to_str (ptid));
+	}
+      child->pending_stop = 1;
+      child->stopped = 0;
+      goto retry;
+    }
+
+got_child:
+  sigprocmask (SIG_SETMASK, &prev_mask, NULL);
+
   return child;
 }
 
@@ -1844,16 +1988,13 @@ linux_wait_for_event (ptid_t ptid, int *
     {
       event_child = linux_wait_for_lwp (wait_ptid, wstat, options);
 
-      if ((options & WNOHANG) && event_child == NULL)
+      if (event_child == NULL)
 	{
 	  if (debug_threads)
-	    fprintf (stderr, "WNOHANG set, no event found\n");
+	    fprintf (stderr, "No event found\n");
 	  return 0;
 	}
 
-      if (event_child == NULL)
-	error ("event from unknown child");
-
       if (ptid_is_pid (ptid)
 	  && ptid_get_pid (ptid) != ptid_get_pid (ptid_of (event_child)))
 	{
@@ -2314,8 +2455,25 @@ retry:
       pid = linux_wait_for_event (step_over_bkpt, &w, options & ~WNOHANG);
     }
 
-  if (pid == 0) /* only if TARGET_WNOHANG */
-    return null_ptid;
+  if (pid == 0)
+    {
+      /* If there are no resumed children left, bail.  We'd be stuck
+	 forever in the sigsuspend call below otherwise.  */
+      if (find_inferior (&all_lwps, not_suspended_callback, &ptid) == NULL)
+	{
+	  ourstatus->kind = TARGET_WAITKIND_NO_RESUMED;
+	  return minus_one_ptid;
+	}
+
+      /* No interesting event to report to the core.  */
+      if (target_options & TARGET_WNOHANG)
+	{
+	  ourstatus->kind = TARGET_WAITKIND_IGNORE;
+	  return minus_one_ptid;
+	}
+
+      return null_ptid;
+    }
 
   event_child = get_thread_lwp (current_inferior);
 
@@ -2899,6 +3057,175 @@ mark_lwp_dead (struct lwp_info *lwp, int
   lwp->stop_expected = 0;
 }
 
+/* Wait for LP to stop.  Returns the wait status, or 0 if the LWP has
+   exited.  */
+
+static int
+wait_lwp (struct lwp_info *lwp)
+{
+  pid_t pid;
+  int status = 0;
+  int thread_dead = 0;
+  sigset_t prev_mask;
+
+  /* Make sure SIGCHLD is blocked for sigsuspend avoiding a race below.  */
+  sigprocmask (SIG_BLOCK, &sigchld_mask, &prev_mask);
+
+  for (;;)
+    {
+      sigset_t suspend_mask;
+
+      /* If my_waitpid returns 0 it means the __WCLONE vs. non-__WCLONE kind
+	 was right and we should just call sigsuspend.  */
+
+      pid = my_waitpid (ptid_get_lwp (lwp->head.id), &status, WNOHANG);
+      if (pid == -1 && errno == ECHILD)
+	pid = my_waitpid (ptid_get_lwp (lwp->head.id), &status, __WCLONE | WNOHANG);
+      if (pid == -1 && errno == ECHILD)
+	{
+	  /* The thread has previously exited.  We need to delete it
+	     now because, for some vendor 2.4 kernels with NPTL
+	     support backported, there won't be an exit event unless
+	     it is the main thread.  2.6 kernels will report an exit
+	     event for each thread that exits, as expected.  */
+	  thread_dead = 1;
+	  if (debug_threads)
+	    fprintf (stderr, "WL: %s vanished.\n",
+		     target_pid_to_str (lwp->head.id));
+	}
+      if (pid != 0)
+	break;
+
+      /* Bugs 10970, 12702.
+	 Thread group leader may have exited in which case we'll lock up in
+	 waitpid if there are other threads, even if they are all zombies too.
+	 Basically, we're not supposed to use waitpid this way.
+	 __WCLONE is not applicable for the leader so we can't use that.
+	 LINUX_NAT_THREAD_ALIVE cannot be used here as it requires a STOPPED
+	 process; it gets ESRCH both for the zombie and for running processes.
+
+	 As a workaround, check if we're waiting for the thread group leader and
+	 if it's a zombie, and avoid calling waitpid if it is.
+
+	 This is racy, what if the tgl becomes a zombie right after we check?
+	 Therefore always use WNOHANG with sigsuspend - it is equivalent to
+	 waiting waitpid but linux_proc_pid_is_zombie is safe this way.  */
+
+      if (ptid_get_pid (lwp->head.id) == ptid_get_lwp (lwp->head.id)
+	  && linux_proc_pid_is_zombie (ptid_get_lwp (lwp->head.id)))
+	{
+	  thread_dead = 1;
+	  if (debug_threads)
+	    fprintf (stderr, "WL: Thread group leader %s vanished.\n",
+		     target_pid_to_str (lwp->head.id));
+	  break;
+	}
+
+      /* Wait for next SIGCHLD and try again.  This may let SIGCHLD handlers
+	 get invoked despite our caller had them intentionally blocked by
+	 block_child_signals.  This is sensitive only to the loop of
+	 linux_nat_wait_1 and there if we get called my_waitpid gets called
+	 again before it gets to sigsuspend so we can safely let the handlers
+	 get executed here.  */
+
+      /* Block until we get an event reported with SIGCHLD.  */
+      sigprocmask (SIG_SETMASK, NULL, &suspend_mask);
+      sigdelset (&suspend_mask, SIGCHLD);
+      sigsuspend (&suspend_mask);
+    }
+
+  sigprocmask (SIG_SETMASK, &prev_mask, NULL);
+
+  if (!thread_dead)
+    {
+      gdb_assert (pid == ptid_get_lwp (lwp->head.id));
+
+      if (debug_threads)
+	fprintf (stderr, "WL: waitpid %s received %x\n",
+		 target_pid_to_str (lwp->head.id), status);
+
+      /* Check if the thread has exited.  */
+      if (WIFEXITED (status) || WIFSIGNALED (status))
+	{
+	  thread_dead = 1;
+	  if (debug_threads)
+	    fprintf (stderr, "WL: %s exited.\n",
+		     target_pid_to_str (lwp->head.id));
+	}
+    }
+
+  if (thread_dead)
+    {
+      delete_lwp (lwp);
+      return 0;
+    }
+
+  gdb_assert (WIFSTOPPED (status));
+
+  lwp->stopped = 1;
+  lwp->last_status = status;
+  if (WIFSTOPPED (status))
+    {
+      lwp->stop_pc = get_stop_pc (lwp);
+      if (WSTOPSIG (status) == SIGSTOP && lwp->stop_expected)
+	lwp->stop_expected = 0;
+    }
+
+  if (WIFSTOPPED (status) && WSTOPSIG (status) == SIGTRAP)
+    {
+      if (the_low_target.stopped_by_watchpoint == NULL)
+	{
+	  lwp->stopped_by_watchpoint = 0;
+	}
+      else
+	{
+	  struct thread_info *saved_inferior;
+
+	  saved_inferior = current_inferior;
+	  current_inferior = get_lwp_thread (lwp);
+
+	  lwp->stopped_by_watchpoint
+	    = the_low_target.stopped_by_watchpoint ();
+
+	  if (lwp->stopped_by_watchpoint)
+	    {
+	      if (the_low_target.stopped_data_address != NULL)
+		lwp->stopped_data_address
+		  = the_low_target.stopped_data_address ();
+	      else
+		lwp->stopped_data_address = 0;
+	    }
+
+	  current_inferior = saved_inferior;
+	}
+    }
+
+  /* Handle GNU/Linux's syscall SIGTRAPs.  */
+  if (WIFSTOPPED (status) && WSTOPSIG (status) == SYSCALL_SIGTRAP)
+    {
+      if (debug_threads)
+	fprintf (stderr, "WL: Handling syscall SIGTRAPs\n");
+      /* No longer need the sysgood bit.  The ptrace event ends up
+	 recorded in lp->waitstatus if we care for it.  We can carry
+	 on handling the event like a regular SIGTRAP from here
+	 on.  */
+      status = W_STOPCODE (SIGTRAP);
+      ptrace (PTRACE_CONT, ptid_get_lwp (lwp->head.id), 0, 0);
+      return wait_lwp (lwp);
+    }
+
+  /* Handle GNU/Linux's extended waitstatus for trace events.  */
+  if (WIFSTOPPED (status) && WSTOPSIG (status) == SIGTRAP && status >> 16 != 0)
+    {
+      if (debug_threads)
+	fprintf (stderr, "WL: Handling extended status 0x%06x\n", status);
+      handle_extended_wait (lwp, status);
+      return wait_lwp (lwp);
+    }
+
+  return status;
+}
+
 static void
 wait_for_sigstop (struct inferior_list_entry *entry)
 {
@@ -2906,8 +3233,6 @@ wait_for_sigstop (struct inferior_list_e
   struct thread_info *saved_inferior;
   int wstat;
   ptid_t saved_tid;
-  ptid_t ptid;
-  int pid;
 
   if (lwp->stopped)
     {
@@ -2923,12 +3248,12 @@ wait_for_sigstop (struct inferior_list_e
   else
     saved_tid = null_ptid; /* avoid bogus unused warning */
 
-  ptid = lwp->head.id;
-
   if (debug_threads)
     fprintf (stderr, "wait_for_sigstop: pulling one event\n");
 
-  pid = linux_wait_for_event (ptid, &wstat, __WALL);
+  wstat = wait_lwp (lwp);
+  if (wstat == 0)
+    return;
 
   /* If we stopped with a non-SIGSTOP signal, save it for later
      and record the pending SIGSTOP.  If the process exited, just
@@ -2952,9 +3277,10 @@ wait_for_sigstop (struct inferior_list_e
   else
     {
       if (debug_threads)
-	fprintf (stderr, "Process %d exited while stopping LWPs\n", pid);
+	fprintf (stderr, "Process %ld exited while stopping LWPs\n",
+		 ptid_get_lwp (lwp->head.id));
 
-      lwp = find_lwp_pid (pid_to_ptid (pid));
+      lwp = find_lwp_pid (pid_to_ptid (ptid_get_lwp (lwp->head.id)));
       if (lwp)
 	{
 	  /* Leave this status pending for the next time we're able to
@@ -5864,5 +6190,8 @@ initialize_low (void)
   sigchld_action.sa_flags = SA_RESTART;
   sigaction (SIGCHLD, &sigchld_action, NULL);
 
+  sigemptyset (&sigchld_mask);
+  sigaddset(&sigchld_mask, SIGCHLD);
+
   initialize_low_arch ();
 }
--- a/gdb/gdbserver/linux-low.h
+++ b/gdb/gdbserver/linux-low.h
@@ -253,6 +253,9 @@ struct lwp_info
      event already received in a wait()).  */
   int stopped;
 
+  /* If this flag is set, the lwp has a wait status pending.  */
+  int pending_stop;
+
   /* If this flag is set, the lwp is known to be dead already (exit
      event already received in a wait(), and is cached in
      status_pending).  */
--- a/gdb/gdbserver/server.c
+++ b/gdb/gdbserver/server.c
@@ -3595,7 +3595,8 @@ handle_target_event (int err, gdb_client
   last_ptid = mywait (minus_one_ptid, &last_status,
 		      TARGET_WNOHANG, 1);
 
-  if (last_status.kind != TARGET_WAITKIND_IGNORE)
+  if (last_status.kind != TARGET_WAITKIND_IGNORE
+      && last_status.kind != TARGET_WAITKIND_NO_RESUMED)
     {
       int pid = ptid_get_pid (last_ptid);
       struct process_info *process = find_process_pid (pid);
--- a/gdb/inferior.h
+++ b/gdb/inferior.h
@@ -65,16 +65,6 @@ extern void discard_infcall_control_stat
 extern struct regcache *
   get_infcall_suspend_state_regcache (struct infcall_suspend_state *);
 
-/* Returns true if PTID matches filter FILTER.  FILTER can be the wild
-   card MINUS_ONE_PTID (all ptid match it); can be a ptid representing
-   a process (ptid_is_pid returns true), in which case, all lwps and
-   threads of that given process match, lwps and threads of other
-   processes do not; or, it can represent a specific thread, in which
-   case, only that thread will match true.  PTID must represent a
-   specific LWP or THREAD, it can never be a wild card.  */
-
-extern int ptid_match (ptid_t ptid, ptid_t filter);
-
 /* Save value of inferior_ptid so that it may be restored by
    a later call to do_cleanups().  Returns the struct cleanup
    pointer needed for later doing the cleanup.  */
--- a/gdb/infrun.c
+++ b/gdb/infrun.c
@@ -7059,20 +7059,6 @@ discard_infcall_control_state (struct in
   xfree (inf_status);
 }
 

-int
-ptid_match (ptid_t ptid, ptid_t filter)
-{
-  if (ptid_equal (filter, minus_one_ptid))
-    return 1;
-  if (ptid_is_pid (filter)
-      && ptid_get_pid (ptid) == ptid_get_pid (filter))
-    return 1;
-  else if (ptid_equal (ptid, filter))
-    return 1;
-
-  return 0;
-}
-
 /* restore_inferior_ptid() will be used by the cleanup machinery
    to restore the inferior_ptid value saved in a call to
    save_inferior_ptid().  */


More information about the Gdb-patches mailing list