This is the mail archive of the elfutils-devel@sourceware.org mailing list for the elfutils project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: [patch v7 3/5] x86* unwinder: libdwfl/

From: Jan Kratochvil <jan dot kratochvil at redhat dot com>
To: elfutils-devel at lists dot fedorahosted dot org
Date: Thu, 24 Oct 2013 21:54:37 +0200
Subject: Re: [patch v7 3/5] x86* unwinder: libdwfl/
On Wed, 23 Oct 2013 13:19:07 +0200, Mark Wielaard wrote:
> No, I missed that __libdwfl_attach_state_for_core already does the above
> checks and we shouldn't get here if they failed. The asserts are
> correct. But maybe add a comment about __libdwfl_attach_state_for_core
> has already checked this?

Added:
  /* __libdwfl_attach_state_for_core already verified the note is there.  */


> > No.  core_next_thread has already read the same number to figure out the TID.
> > Here the same core number is just read a second time.
> 
> Right, I see it now. OK. Maybe another comment?

Added:
  /* core_next_thread already found this TID there.  */


> Sorry about being a little paranoid about the asserts.
> It really is a pain if they could accidentally happen on bad input. So I
> am really just trying to make sure they can/should really never fail.

That's fine, you have found many cases of incorrect error handling in my code.


> > > > +static bool
> > > > +ptrace_attach (pid_t tid)
> > > > +{
> > > > +  if (ptrace (PTRACE_ATTACH, tid, NULL, NULL) != 0)
> > > > +    return false;
> > > > +  /* FIXME: Handle missing SIGSTOP on old Linux kernels.  */
> > > 
> > > How old are these kernels?
> > > If this is before say 2.6.18 kernels (7 years ago now) then I
> > > think it isn't too urgent.
> > 
> > It happens in RHEL-6.  It is only very recent change where PTRACE_ATTACH can
> > no longer hang.
> 
> What a pain :{
> 
> So the problem is that sometimes the kernel forgets to send a SIGSTOP to
> the child after the PTRACE_ATTACH? Does the waitpid then wait for ever
> or does it just get something else then a SIGSTOP instead? Is there a
> workaround?

One can find it in gdb/linux-nat.c:

  if (linux_proc_pid_is_stopped (pid))
    {
      if (debug_linux_nat)
        fprintf_unfiltered (gdb_stdlog,
                            "LNPAW: Attaching to a stopped process\n");

      /* The process is definitely stopped.  It is in a job control
         stop, unless the kernel predates the TASK_STOPPED /
         TASK_TRACED distinction, in which case it might be in a
         ptrace stop.  Make sure it is in a ptrace stop; from there we
         can kill it, signal it, et cetera.

         First make sure there is a pending SIGSTOP.  Since we are
         already attached, the process can not transition from stopped
         to running without a PTRACE_CONT; so we know this signal will
         go into the queue.  The SIGSTOP generated by PTRACE_ATTACH is
         probably already in the queue (unless this kernel is old
         enough to use TASK_STOPPED for ptrace stops); but since SIGSTOP
         is not an RT signal, it can only be queued once.  */
      kill_lwp (pid, SIGSTOP);

      /* Finally, resume the stopped process.  This will deliver the SIGSTOP
         (or a higher priority signal, just like normal PTRACE_ATTACH).  */
      ptrace (PTRACE_CONT, pid, 0, 0);
    }

The problem is that PTRACE_ATTACH generates SIGSTOP which is normally caught
by waitpid.  But the same SIGSTOP one can send before PTRACE_ATTACHing.  That
is also fine.  The problem is when one sends SIGSTOP before PTRACE_ATTACHing,
someone waits on that, gets the SIGSTOP notification, then one
PTRACE_ATTACHes, PTRACE_ATTACH should generate new SIGSTOP - but SIGSTOPs do
not nest/count, they are not realtime-signal - and as the process is still
stopped the new SIGSTOP will not generate new notification.  Therefore waitpid
after that PTRACE_ATTACH will hang forever, never geting the second SIGSTOP.
The "someone waits on that" part typically happens by parent bash.
This is how I have always understood it.  Fortunately it is history now, new
kernels always generate the SIGSTOP notification from PTRACE_ATTACH.

I have tested it now with:
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <assert.h>
	#include <sys/wait.h>
	#include <stdlib.h>
	int main(int argc,char **argv) {
	  setbuf(stdout,NULL);
	  assert(argc==2);
	  pid_t pid=atoi(argv[1]);
	  long l=ptrace(PTRACE_ATTACH,pid,NULL,NULL);
	  assert(l==0);
	  puts("waiting...");
	  pid_t got=waitpid(pid,NULL,0);
	  assert(got==pid);
	  puts("wait done");
	  return 0;
	}
(as I failed to build elfutils GIT easily enough on RHEL-5) and tested it
using
	sleep 1h&p=$!;sleep 1;kill -STOP $p;sleep 1;./attach $p
and on RHEL-5 kernel-2.6.18-348.12.1.el5.x86_64 it really hangs:
	[1] 12490

	[1]+  Stopped                 sleep 1h
	waiting...
	<hang>
although on RHEL-6 kernel-2.6.32-358.18.1.el6.x86_64 it works fine:
	[1] 16760

	[1]+  Stopped                 sleep 1h
	waiting...
	wait done

Still that means the SIGSTOP handling we need in elfutils for RHEL-5, added:

  if (linux_proc_pid_is_stopped (tid))
    {
      /* See gdb/linux-nat.c linux_nat_post_attach_wait.  */
      syscall (__NR_tkill, tid, SIGSTOP);
      ptrace (PTRACE_CONT, tid, NULL, NULL);
    }

plus:

static bool
linux_proc_pid_is_stopped (pid_t pid)
{
  char buffer[64];
  FILE *procfile;
  bool retval, have_state;

  snprintf (buffer, sizeof (buffer), "/proc/%ld/status", (long) pid);
  procfile = fopen (buffer, "r");
  if (procfile == NULL)
    return false;

  have_state = false;
  while (fgets (buffer, sizeof (buffer), procfile) != NULL)
    if (strncmp (buffer, "State:", 6) == 0)
      {
        have_state = true;
        break;
      }
  retval = (have_state && strstr (buffer, "T (stopped)") != NULL);
  fclose (procfile);
  return retval;
}


> I like small stack frames, so if 32 could be used that would be nice.
> But it was just a nitpick indeed. Keep it as is if that makes sure there
> is always enough room.

Kept as is.


> > @@ -147,18 +171,11 @@ static bool
> >  pid_set_initial_registers (Dwfl_Thread *thread, void *thread_arg)
> >  {
> >    struct pid_arg *pid_arg = thread_arg;
> > +  assert (! pid_arg->tid_attached);
> 
> Nitpick. I like x != 0 in case a variable isn't a bool to make clear
                  x == 0
> what we are really checking.

Changed.


> Looks good. Only slightly concerned about the missing SIGSTOP issue.

Implemented.


Thanks,
Jan

diff --git a/libdwfl/linux-core-attach.c b/libdwfl/linux-core-attach.c
index d2873a0..7e6d4da 100644
--- a/libdwfl/linux-core-attach.c
+++ b/libdwfl/linux-core-attach.c
@@ -168,6 +168,7 @@ core_set_initial_registers (Dwfl_Thread *thread, void *thread_arg_voidp)
   assert (offset < note_data->d_size);
   size_t getnote_err = gelf_getnote (note_data, offset, &nhdr, &name_offset,
 				     &desc_offset);
+  /* __libdwfl_attach_state_for_core already verified the note is there.  */
   assert (getnote_err != 0);
   /* Do not check NAME for now, help broken Linux kernels.  */
   const char *name = note_data->d_buf + name_offset;
@@ -179,6 +180,7 @@ core_set_initial_registers (Dwfl_Thread *thread, void *thread_arg_voidp)
   const Ebl_Core_Item *items;
   int core_note_err = ebl_core_note (core_arg->ebl, &nhdr, name, &regs_offset,
 				     &nregloc, &reglocs, &nitems, &items);
+  /* __libdwfl_attach_state_for_core already verified the note is there.  */
   assert (core_note_err != 0);
   assert (nhdr.n_type == NT_PRSTATUS);
   const Ebl_Core_Item *item;
@@ -194,6 +196,7 @@ core_set_initial_registers (Dwfl_Thread *thread, void *thread_arg_voidp)
     tid = (int32_t) val32;
     eu_static_assert (sizeof val32 <= sizeof tid);
   }
+  /* core_next_thread already found this TID there.  */
   assert (tid == INTUSE(dwfl_thread_tid) (thread));
   desc += regs_offset;
   for (size_t regloci = 0; regloci < nregloc; regloci++)
diff --git a/libdwfl/linux-pid-attach.c b/libdwfl/linux-pid-attach.c
index c7e5135..9d98220 100644
--- a/libdwfl/linux-pid-attach.c
+++ b/libdwfl/linux-pid-attach.c
@@ -30,6 +30,8 @@
 #include <sys/ptrace.h>
 #include <sys/wait.h>
 #include <dirent.h>
+#include <sys/syscall.h>
+#include <unistd.h>
 
 #ifndef MAX
 # define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -43,6 +45,30 @@ struct pid_arg
 };
 
 static bool
+linux_proc_pid_is_stopped (pid_t pid)
+{
+  char buffer[64];
+  FILE *procfile;
+  bool retval, have_state;
+
+  snprintf (buffer, sizeof (buffer), "/proc/%ld/status", (long) pid);
+  procfile = fopen (buffer, "r");
+  if (procfile == NULL)
+    return false;
+
+  have_state = false;
+  while (fgets (buffer, sizeof (buffer), procfile) != NULL)
+    if (strncmp (buffer, "State:", 6) == 0)
+      {
+	have_state = true;
+	break;
+      }
+  retval = (have_state && strstr (buffer, "T (stopped)") != NULL);
+  fclose (procfile);
+  return retval;
+}
+
+static bool
 ptrace_attach (pid_t tid)
 {
   if (ptrace (PTRACE_ATTACH, tid, NULL, NULL) != 0)
@@ -50,7 +76,12 @@ ptrace_attach (pid_t tid)
       __libdwfl_seterrno (DWFL_E_ERRNO);
       return false;
     }
-  /* FIXME: Handle missing SIGSTOP on old Linux kernels.  */
+  if (linux_proc_pid_is_stopped (tid))
+    {
+      /* See gdb/linux-nat.c linux_nat_post_attach_wait.  */
+      syscall (__NR_tkill, tid, SIGSTOP);
+      ptrace (PTRACE_CONT, tid, NULL, NULL);
+    }
   for (;;)
     {
       int status;
@@ -170,7 +201,7 @@ static bool
 pid_set_initial_registers (Dwfl_Thread *thread, void *thread_arg)
 {
   struct pid_arg *pid_arg = thread_arg;
-  assert (! pid_arg->tid_attached);
+  assert (pid_arg->tid_attached == 0);
   pid_t tid = INTUSE(dwfl_thread_tid) (thread);
   if (! ptrace_attach (tid))
     return false;
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]