[PATCH] follow-exec: handle targets that don't have thread exit events

Tue Mar 3 01:37:00 GMT 2015

On 12/06/2014 12:55 AM, Breazeal, Don wrote:

> I walked through this, and it makes sense to me.  We know that
> on entry to follow_exec inferior_thread() is the event thread,
> which is also the leader thread, right? Thanks for digging into this.

Yep.

Funny thing, while looking at your fork series today (need to
play with it a bit more), and recalled this patch.  And then
tonight I rebased my all-stop-on-top-of-non-stop series on current
mainline and noticed that GDB crashed while running thread-execl.exp.
Running the test in (real) non-stop mode with current mainline
under Valgrind shows the same exact same invalid reads...  So this
isn't specific to remote debugging only and can be triggered today.
Given that, I've now extended the thread-execl.exp test to run in
non-stop mode too, tweaked the comment in infrun a bit, and pushed
it in, as below.

---
>From 95e50b2723eba05ca34e9ea69c1de63e65ce9578 Mon Sep 17 00:00:00 2001
From: Pedro Alves <palves@redhat.com>
Date: Tue, 3 Mar 2015 01:25:17 +0000
Subject: [PATCH] follow-exec: delete all non-execing threads

This fixes invalid reads Valgrind first caught when debugging against
a GDBserver patched with a series that adds exec events to the remote
protocol.  Like these, using the gdb.threads/thread-execl.exp test:

$ valgrind ./gdb -data-directory=data-directory ./testsuite/gdb.threads/thread-execl  -ex "tar extended-remote :9999" -ex "b thread_execler" -ex "c" -ex "set scheduler-locking on"
...
Breakpoint 1, thread_execler (arg=0x0) at src/gdb/testsuite/gdb.threads/thread-execl.c:29
29        if (execl (image, image, NULL) == -1)
(gdb) n
Thread 32509.32509 is executing new program: build/gdb/testsuite/gdb.threads/thread-execl
[New Thread 32509.32532]
==32510== Invalid read of size 4
==32510==    at 0x5AA7D8: delete_breakpoint (breakpoint.c:13989)
==32510==    by 0x6285D3: delete_thread_breakpoint (thread.c:100)
==32510==    by 0x628603: delete_step_resume_breakpoint (thread.c:109)
==32510==    by 0x61622B: delete_thread_infrun_breakpoints (infrun.c:2928)
==32510==    by 0x6162EF: for_each_just_stopped_thread (infrun.c:2958)
==32510==    by 0x616311: delete_just_stopped_threads_infrun_breakpoints (infrun.c:2969)
==32510==    by 0x616C96: fetch_inferior_event (infrun.c:3267)
==32510==    by 0x63A2DE: inferior_event_handler (inf-loop.c:57)
==32510==    by 0x4E0E56: remote_async_serial_handler (remote.c:11877)
==32510==    by 0x4AF620: run_async_handler_and_reschedule (ser-base.c:137)
==32510==    by 0x4AF6F0: fd_event (ser-base.c:182)
==32510==    by 0x63806D: handle_file_event (event-loop.c:762)
==32510==  Address 0xcf333e0 is 16 bytes inside a block of size 200 free'd
==32510==    at 0x4A07577: free (in /usr/lib64/valgrind/vgpreload_memcheck-amd64-linux.so)
==32510==    by 0x77CB74: xfree (common-utils.c:98)
==32510==    by 0x5AA954: delete_breakpoint (breakpoint.c:14056)
==32510==    by 0x5988BD: update_breakpoints_after_exec (breakpoint.c:3765)
==32510==    by 0x61360F: follow_exec (infrun.c:1091)
==32510==    by 0x6186FA: handle_inferior_event (infrun.c:4061)
==32510==    by 0x616C55: fetch_inferior_event (infrun.c:3261)
==32510==    by 0x63A2DE: inferior_event_handler (inf-loop.c:57)
==32510==    by 0x4E0E56: remote_async_serial_handler (remote.c:11877)
==32510==    by 0x4AF620: run_async_handler_and_reschedule (ser-base.c:137)
==32510==    by 0x4AF6F0: fd_event (ser-base.c:182)
==32510==    by 0x63806D: handle_file_event (event-loop.c:762)
==32510==
[Switching to Thread 32509.32532]

Breakpoint 1, thread_execler (arg=0x0) at src/gdb/testsuite/gdb.threads/thread-execl.c:29
29        if (execl (image, image, NULL) == -1)
(gdb)

The breakpoint in question is the step-resume breakpoint of the
non-main thread, the one that was "next"ed.

The exact same issue can be seen on mainline with native debugging, by
running the thread-execl.exp test in non-stop mode, because the kernel
doesn't report a thread exit event for the execing thread.

Tested on x86_64 Fedora 20.

gdb/ChangeLog:
2015-03-02  Pedro Alves  <palves@redhat.com>

	* infrun.c (follow_exec): Delete all threads of the process except
	the event thread.  Extended comments.

gdb/testsuite/ChangeLog:
2015-03-02  Pedro Alves  <palves@redhat.com>

	* gdb.threads/thread-execl.exp (do_test): Handle non-stop.
	(top level): Call do_test with non-stop as well.
---
 gdb/ChangeLog                              |  5 ++++
 gdb/testsuite/ChangeLog                    |  5 ++++
 gdb/infrun.c                               | 48 ++++++++++++++++++++++--------
 gdb/testsuite/gdb.threads/thread-execl.exp | 24 +++++++++++++--
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/gdb/ChangeLog b/gdb/ChangeLog
index 922b1d9..68c55c1 100644
--- a/gdb/ChangeLog
+++ b/gdb/ChangeLog
@@ -1,3 +1,8 @@
+2015-03-02  Pedro Alves  <palves@redhat.com>
+
+	* infrun.c (follow_exec): Delete all threads of the process except
+	the event thread.  Extended comments.
+
 2015-03-02  Joel Brobecker  <brobecker@adacore.com>
 
 	* contrib/ari/gdb_ari.sh: Reinstate checks for "true" and "false".
diff --git a/gdb/testsuite/ChangeLog b/gdb/testsuite/ChangeLog
index 76ed5e2..3b7bf7d 100644
--- a/gdb/testsuite/ChangeLog
+++ b/gdb/testsuite/ChangeLog
@@ -1,5 +1,10 @@
 2015-03-02  Pedro Alves  <palves@redhat.com>
 
+	* gdb.threads/thread-execl.exp (do_test): Handle non-stop.
+	(top level): Call do_test with non-stop as well.
+
+2015-03-02  Pedro Alves  <palves@redhat.com>
+
 	* lib/gdb.exp (gdb_test_multiple) <internal error>: Set result to
 	-1.
 
diff --git a/gdb/infrun.c b/gdb/infrun.c
index 15589b6..f87ed4c 100644
--- a/gdb/infrun.c
+++ b/gdb/infrun.c
@@ -1060,10 +1060,11 @@ show_follow_exec_mode_string (struct ui_file *file, int from_tty,
 /* EXECD_PATHNAME is assumed to be non-NULL.  */
 
 static void
-follow_exec (ptid_t pid, char *execd_pathname)
+follow_exec (ptid_t ptid, char *execd_pathname)
 {
-  struct thread_info *th = inferior_thread ();
+  struct thread_info *th, *tmp;
   struct inferior *inf = current_inferior ();
+  int pid = ptid_get_pid (ptid);
 
   /* This is an exec event that we actually wish to pay attention to.
      Refresh our symbol table to the newly exec'd program, remove any
@@ -1088,24 +1089,47 @@ follow_exec (ptid_t pid, char *execd_pathname)
 
   mark_breakpoints_out ();
 
-  update_breakpoints_after_exec ();
-
-  /* If there was one, it's gone now.  We cannot truly step-to-next
-     statement through an exec().  */
+  /* The target reports the exec event to the main thread, even if
+     some other thread does the exec, and even if the main thread was
+     stopped or already gone.  We may still have non-leader threads of
+     the process on our list.  E.g., on targets that don't have thread
+     exit events (like remote); or on native Linux in non-stop mode if
+     there were only two threads in the inferior and the non-leader
+     one is the one that execs (and nothing forces an update of the
+     thread list up to here).  When debugging remotely, it's best to
+     avoid extra traffic, when possible, so avoid syncing the thread
+     list with the target, and instead go ahead and delete all threads
+     of the process but one that reported the event.  Note this must
+     be done before calling update_breakpoints_after_exec, as
+     otherwise clearing the threads' resources would reference stale
+     thread breakpoints -- it may have been one of these threads that
+     stepped across the exec.  We could just clear their stepping
+     states, but as long as we're iterating, might as well delete
+     them.  Deleting them now rather than at the next user-visible
+     stop provides a nicer sequence of events for user and MI
+     notifications.  */
+  ALL_NON_EXITED_THREADS_SAFE (th, tmp)
+    if (ptid_get_pid (th->ptid) == pid && !ptid_equal (th->ptid, ptid))
+      delete_thread (th->ptid);
+
+  /* We also need to clear any left over stale state for the
+     leader/event thread.  E.g., if there was any step-resume
+     breakpoint or similar, it's gone now.  We cannot truly
+     step-to-next statement through an exec().  */
+  th = inferior_thread ();
   th->control.step_resume_breakpoint = NULL;
   th->control.exception_resume_breakpoint = NULL;
   th->control.single_step_breakpoints = NULL;
   th->control.step_range_start = 0;
   th->control.step_range_end = 0;
 
-  /* The target reports the exec event to the main thread, even if
-     some other thread does the exec, and even if the main thread was
-     already stopped --- if debugging in non-stop mode, it's possible
-     the user had the main thread held stopped in the previous image
-     --- release it now.  This is the same behavior as step-over-exec
-     with scheduler-locking on in all-stop mode.  */
+  /* The user may have had the main thread held stopped in the
+     previous image (e.g., schedlock on, or non-stop).  Release
+     it now.  */
   th->stop_requested = 0;
 
+  update_breakpoints_after_exec ();
+
   /* What is this a.out's name?  */
   printf_unfiltered (_("%s is executing new program: %s\n"),
 		     target_pid_to_str (inferior_ptid),
diff --git a/gdb/testsuite/gdb.threads/thread-execl.exp b/gdb/testsuite/gdb.threads/thread-execl.exp
index 4a8016c..a598ad0 100644
--- a/gdb/testsuite/gdb.threads/thread-execl.exp
+++ b/gdb/testsuite/gdb.threads/thread-execl.exp
@@ -34,9 +34,18 @@ if {[gdb_compile_pthreads "${srcdir}/${subdir}/${srcfile}" "${binfile}" \
 proc do_test { schedlock } {
     global binfile
 
-    with_test_prefix "schedlock $schedlock" {
+    if {$schedlock == "non-stop"} {
+	set prefix $schedlock
+    } else {
+	set prefix "schedlock $schedlock"
+    }
+    with_test_prefix "$prefix" {
 	clean_restart ${binfile}
 
+	if {$schedlock == "non-stop"} {
+	    gdb_test_no_output "set non-stop 1"
+	}
+
 	if ![runto_main] {
 	    return 0
 	}
@@ -45,16 +54,25 @@ proc do_test { schedlock } {
 	gdb_breakpoint "thread_execler"
 	gdb_test "continue" ".*thread_execler.*" "continue to thread start"
 
+	if {$schedlock == "non-stop"} {
+	    gdb_test "thread 2" \
+		"Switching to .*thread_execler.*" \
+		"switch to event thread"
+	}
+
 	# Now set a breakpoint at `main', and step over the execl call.  The
 	# breakpoint at main should be reached.  GDB should not try to revert
 	# back to the old thread from the old image and resume stepping it
 	# (since it is gone).
 	gdb_breakpoint "main"
-	gdb_test_no_output "set scheduler-locking $schedlock"
+
+	if {$schedlock != "non-stop"} {
+	    gdb_test_no_output "set scheduler-locking $schedlock"
+	}
 	gdb_test "next" ".*main.*" "get to main in new image"
     }
 }
 
-foreach schedlock {"off" "step" "on"} {
+foreach schedlock {"off" "step" "on" "non-stop"} {
     do_test $schedlock
 }
-- 
1.9.3