This is the mail archive of the systemtap@sourceware.org mailing list for the systemtap project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] PR23891: Make sure stap and staprun respond to SIGTERM when stderr/stdout are blocked


When stderr/stdout are blocked (the write buffers are full), write()
syscalls in stap's signal handler and staprun's stp_main_loop() might
prevent these processes from responding to signals like SIGTERM.

Also make staprun respond to SIGPIPE just like SIGTERM.

We introduce the kill_relayfs() function to kill the reader thread in
staprun without waiting for the readers (which might alreayd be blocked
on writing to stdout).

Our local stress tests have confirmed that this patch indeed fixes the
hanging issues in stap and staprun.
---
 main.cxx           | 24 ++++++++++++++++++++++--
 staprun/mainloop.c | 45 ++++++++++++++++++++++++++++++++-------------
 staprun/relay.c    | 29 +++++++++++++++++++++++++++++
 staprun/staprun.h  |  1 +
 4 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/main.cxx b/main.cxx
index 423e2abb4..dd80a9fd8 100644
--- a/main.cxx
+++ b/main.cxx
@@ -54,6 +54,7 @@ extern "C" {
 #include <unistd.h>
 #include <wordexp.h>
 #include <ftw.h>
+#include <fcntl.h>
 }
 
 using namespace std;
@@ -277,8 +278,27 @@ void handle_interrupt (int)
   if (pending_interrupts > 2) // XXX: should be configurable? time-based?
     {
       char msg[] = "Too many interrupts received, exiting.\n";
-      int rc = write (2, msg, sizeof(msg)-1);
-      if (rc) {/* Do nothing; we don't care if our last gasp went out. */ ;}
+
+      /* NB: writing to stderr blockingly in a signal handler is dangerous
+       * since it may prevent the stap process from quitting gracefully
+       * on receiving SIGTERM/etc signals when the stderr write buffer
+       * is full. PR23891 */
+      int flags = fcntl(2, F_GETFL);
+      if (flags == -1)
+        _exit (1);
+
+      if (!(flags & O_NONBLOCK))
+        {
+          if (fcntl(2, F_SETFL, flags | O_NONBLOCK) == 0) {
+            int rc = write (2, msg, sizeof(msg)-1);
+            if (rc)
+              {
+                /* Do nothing; we don't care if our last gasp went out. */
+                ;
+              }
+          }
+        }  /* if O_NONBLOCK */
+
       _exit (1);
     }
 }
diff --git a/staprun/mainloop.c b/staprun/mainloop.c
index 0ae0aea47..b60408776 100644
--- a/staprun/mainloop.c
+++ b/staprun/mainloop.c
@@ -31,6 +31,24 @@ static int target_pid_failed_p = 0;
    main thread of interruptable events. */
 static pthread_t main_thread;
 
+static void set_nonblocking_std_fds(void)
+{
+  for (int fd = 1; fd < 3; fd++) {
+    /* NB: writing to stderr/stdout blockingly in signal handler is
+     * dangerous since it may prevent the stap process from quitting
+     * gracefully on receiving SIGTERM/etc signals when the stderr/stdout
+     * write buffer is full. PR23891 */
+    int flags = fcntl(2, F_GETFL);
+    if (flags == -1)
+      continue;
+
+    if (flags & O_NONBLOCK)
+      continue;
+
+    (void) fcntl(2, F_SETFL, flags | O_NONBLOCK);
+  }
+}
+
 static void *signal_thread(void *arg)
 {
   sigset_t *s = (sigset_t *) arg;
@@ -41,13 +59,20 @@ static void *signal_thread(void *arg)
       _perr("sigwait");
       continue;
     }
-    dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum));
     if (signum == SIGQUIT) {
       load_only = 1; /* flag for stp_main_loop */
       pending_interrupts ++;
-    } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM) {
+    } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM
+               || signum == SIGPIPE)
+    {
       pending_interrupts ++;
     }
+    if (pending_interrupts > 2) {
+      set_nonblocking_std_fds();
+      pthread_kill (main_thread, SIGURG);
+    }
+    /* stderr is not blocking at this point */
+    dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum));
   }
   /* Notify main thread (interrupts select). */
   pthread_kill (main_thread, SIGURG);
@@ -153,6 +178,7 @@ static void setup_main_signals(void)
   sigaddset(s, SIGTERM);
   sigaddset(s, SIGHUP);
   sigaddset(s, SIGQUIT);
+  sigaddset(s, SIGPIPE);
   pthread_sigmask(SIG_SETMASK, s, NULL);
   if (pthread_create(&tid, NULL, signal_thread, s) < 0) {
     _perr(_("failed to create thread"));
@@ -533,10 +559,12 @@ void cleanup_and_exit(int detach, int rc)
   /* OTOH, it may be still be running - but there's no need for
      us to wait for it, considering that the script must have exited
      for another reason.  So, we no longer   while(...wait()...);  here.
-     XXX: we could consider killing it. */
+   */
 
   if (use_old_transport)
     close_oldrelayfs(detach);
+  else if (pending_interrupts > 2)
+    kill_relayfs();
   else
     close_relayfs();
 
@@ -637,7 +665,7 @@ int stp_main_loop(void)
   struct timespec ts;
   struct timespec *timeout = NULL;
   fd_set fds;
-  sigset_t blockset, mainset;
+  sigset_t mainset;
 
 
   setvbuf(ofp, (char *)NULL, _IONBF, 0);
@@ -663,15 +691,6 @@ int stp_main_loop(void)
   res = select(control_channel + 1, NULL, NULL, &fds, &tv);
   select_supported = (res == 1 && FD_ISSET(control_channel, &fds));
   dbug(2, "select_supported: %d\n", select_supported);
-  if (select_supported) {
-    /* We block SIGURG to the main thread, except when we call
-       pselect(). This makes sure we won't miss any signals. All other
-       calls are non-blocking, so we defer till pselect() time, which
-       is when we are "sleeping". */
-    sigemptyset(&blockset);
-    sigaddset(&blockset, SIGURG);
-    pthread_sigmask(SIG_BLOCK, &blockset, &mainset);
-  }
 
   if (monitor)
       monitor_setup();
diff --git a/staprun/relay.c b/staprun/relay.c
index c87bef4ca..4340aef0c 100644
--- a/staprun/relay.c
+++ b/staprun/relay.c
@@ -468,3 +468,32 @@ void close_relayfs(void)
 	}
 	dbug(2, "done\n");
 }
+
+void kill_relayfs(void)
+{
+	int i;
+	stop_threads = 1;
+	dbug(2, "closing\n");
+	for (i = 0; i < ncpus; i++) {
+		if (reader[avail_cpus[i]])
+			pthread_kill(reader[avail_cpus[i]], SIGUSR2);
+		else
+			break;
+	}
+	for (i = 0; i < ncpus; i++) {
+		if (reader[avail_cpus[i]])
+			pthread_cancel(reader[avail_cpus[i]]); /* no wait */
+		else
+			break;
+	}
+	for (i = 0; i < ncpus; i++) {
+		if (relay_fd[avail_cpus[i]] >= 0)
+			close(relay_fd[avail_cpus[i]]);
+		else
+			break;
+	}
+	for (i = 0; i < ncpus; i++) {
+		pthread_mutex_destroy(&mutex[avail_cpus[i]]);
+	}
+	dbug(2, "done\n");
+}
diff --git a/staprun/staprun.h b/staprun/staprun.h
index 7cebd36f6..2ad78da7c 100644
--- a/staprun/staprun.h
+++ b/staprun/staprun.h
@@ -164,6 +164,7 @@ int init_ctl_channel(const char *name, int verb);
 void close_ctl_channel(void);
 int init_relayfs(void);
 void close_relayfs(void);
+void kill_relayfs(void);
 int init_oldrelayfs(void);
 void close_oldrelayfs(int);
 int write_realtime_data(void *data, ssize_t nb);
-- 
2.11.0.295.gd7dffce


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]