[PATCH] PR26775 interruptible/observable grooming

Frank Ch. Eigler fche@redhat.com
Thu Oct 29 18:30:36 GMT 2020


Hi -

Author: Frank Ch. Eigler <fche@redhat.com>
Date:   Thu Oct 29 14:25:18 2020 -0400

    PR26775: make grooming progress visible & interruptible
    
    On very large servers, it's desirable to be able to interrupt a rescan
    or groom cycle.  SIGUSR[12] now do that.  (Unfortunately, this is not
    practically testable in the testsuite, since these cycles are so fast
    on that small dataset.)  We also expose more internal progress count
    about the grooming pass, so the administrator can assess possible need
    to interrupt.
    
    Signed-off-by: Frank Ch. Eigler <fche@redhat.com>

diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 9eb9c813dd23..5af4dc6ca08f 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,14 @@
+2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* debuginfod.cxx (forced_*_count): Make these global.
+	(runq::clear): New function.
+	(thread_main_scanner): Check for pending SIGUSR2; interrupt.
+	(scan_source_paths): Check for pending SIGUSR2; interrupt.
+	(groom): Report prometheus stats before groom also.  Check for
+	pending SIGUSR1; interrupt.  Increment thread_work_total for
+	each file scanned, not the entire cycle.
+
 2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
 
 	PR26810
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 9da65d87d601..8448a501f356 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -388,7 +388,9 @@ static string db_path;
 static sqlite3 *db; // single connection, serialized across all our threads!
 static unsigned verbose;
 static volatile sig_atomic_t interrupted = 0;
+static volatile sig_atomic_t forced_rescan_count = 0;
 static volatile sig_atomic_t sigusr1 = 0;
+static volatile sig_atomic_t forced_groom_count = 0;
 static volatile sig_atomic_t sigusr2 = 0;
 static unsigned http_port = 8002;
 static unsigned rescan_s = 300;
@@ -607,6 +609,14 @@ class workq
     cv.notify_all();
   }
 
+  // clear the workqueue, when scanning is interrupted with USR2
+  void clear() {
+    unique_lock<mutex> lock(mtx);
+    q.clear();
+    set_metric("thread_work_pending","role","scan", q.size());
+    cv.notify_all(); // maybe wake up waiting idlers
+  }
+
   // block this scanner thread until there is work to do and no active
   bool wait_front (Payload& p)
   {
@@ -2741,6 +2751,12 @@ thread_main_scanner (void* arg)
         }
 
       inc_metric("thread_work_total", "role","scan");
+      
+      if (sigusr2 != forced_groom_count) // stop early if groom triggered 
+        {
+          scanq.clear();
+          break;
+        }
     }
 
   add_metric("thread_busy", "role", "scan", -1);
@@ -2784,6 +2800,9 @@ scan_source_paths()
   {
     if (interrupted) break;
 
+    if (sigusr2 != forced_groom_count) // stop early if groom triggered 
+      break;
+    
     fts_scanned ++;
 
     if (verbose > 2)
@@ -2842,7 +2861,6 @@ thread_main_fts_source_paths (void* arg)
 {
   (void) arg; // ignore; we operate on global data
 
-  sig_atomic_t forced_rescan_count = 0;
   set_metric("thread_tid", "role","traverse", tid());
   add_metric("thread_count", "role", "traverse", 1);
 
@@ -2923,6 +2941,8 @@ void groom()
   struct timeval tv_start, tv_end;
   gettimeofday (&tv_start, NULL);
 
+  database_stats_report();
+  
   // scan for files that have disappeared
   sqlite_ps files (db, "check old files", "select s.mtime, s.file, f.name from "
                        BUILDIDS "_file_mtime_scanned s, " BUILDIDS "_files f "
@@ -2951,6 +2971,10 @@ void groom()
           files_del_r_de.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
           files_del_scan.reset().bind(1,fileid).bind(2,mtime).step_ok_done();
         }
+      
+      inc_metric("thread_work_total", "role", "groom");
+      if (sigusr1 != forced_rescan_count) // stop early if scan triggered
+        break;
     }
   files.reset();
 
@@ -2987,7 +3011,6 @@ void groom()
 static void*
 thread_main_groom (void* /*arg*/)
 {
-  sig_atomic_t forced_groom_count = 0;
   set_metric("thread_tid", "role", "groom", tid());
   add_metric("thread_count", "role", "groom", 1);
 
@@ -3016,7 +3039,6 @@ thread_main_groom (void* /*arg*/)
             set_metric("thread_busy", "role", "groom", 1);
             groom ();
             last_groom = time(NULL); // NB: now was before grooming
-            inc_metric("thread_work_total", "role", "groom");
             set_metric("thread_busy", "role", "groom", 0);
           }
         catch (const sqlite_exception& e)
diff --git a/doc/ChangeLog b/doc/ChangeLog
index bde52c1157b7..8c33f1749c7f 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* debuginfod.8: Document that SIGUSR1 interrupts the groom
+	cycle, and SIGUSR2 interrupts rescan.
+
 2020-10-25  Mark Wielaard  <mark@klomp.org>
 
 	* debuginfod_find_debuginfo.3 (ECONNREFUSED): Document that this
diff --git a/doc/debuginfod.8 b/doc/debuginfod.8
index a645ceed5eaa..152e368ce5be 100644
--- a/doc/debuginfod.8
+++ b/doc/debuginfod.8
@@ -155,7 +155,8 @@ before doing it again.  A rescan for unchanged files is fast (because
 the index also stores the file mtimes).  A time of zero is acceptable,
 and means that only one initial scan should performed.  The default
 rescan time is 300 seconds.  Receiving a SIGUSR1 signal triggers a new
-scan, independent of the rescan time (including if it was zero).
+scan, independent of the rescan time (including if it was zero),
+interrupting a groom pass (if any).
 
 .TP
 .B "\-g SECONDS" "\-\-groom\-time=SECONDS"
@@ -167,7 +168,8 @@ it can deindex obsolete files.  See also the \fIDATA MANAGEMENT\fP
 section.  The default groom time is 86400 seconds (1 day).  A time of
 zero is acceptable, and means that only one initial groom should be
 performed.  Receiving a SIGUSR2 signal triggers a new grooming pass,
-independent of the groom time (including if it was zero).
+independent of the groom time (including if it was zero), interrupting
+a rescan pass (if any)..
 
 .TP
 .B "\-G"
diff --git a/tests/ChangeLog b/tests/ChangeLog
index f57db304bfeb..471479f6921e 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
+
+	PR26775
+	* run-debuginfod-find.sh: Modify test for different
+	thread_work_total semantics for grooming.
+
 2020-10-29  Frank Ch. Eigler  <fche@redhat.com>
 
 	PR26810
diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh
index 52def36438bb..3af04c00bcd6 100755
--- a/tests/run-debuginfod-find.sh
+++ b/tests/run-debuginfod-find.sh
@@ -345,8 +345,9 @@ RPM_BUILDID=d44d42cbd7d915bc938c81333a21e355a6022fb7 # in rhel6/ subdir, for a l
 rm -r R/debuginfod-rpms/rhel6/*
 kill -USR2 $PID1  # groom cycle
 # Expect 3 rpms to be deleted by the groom
-# 1 groom already took place at/soon-after startup, so -USR2 makes 2
-wait_ready $PORT1 'thread_work_total{role="groom"}' 2
+# 1 groom cycle already took place at/soon-after startup, so -USR2 makes 2
+# ... times the # of files checked in each cycle
+wait_ready $PORT1 'thread_work_total{role="groom"}' 51
 wait_ready $PORT1 'groom{statistic="file d/e"}' 3
 
 rm -rf $DEBUGINFOD_CACHE_PATH # clean it from previous tests
@@ -363,7 +364,7 @@ testrun ${abs_top_builddir}/debuginfod/debuginfod-find executable $BUILDID2
 
 # run a groom cycle to force server to drop its fdcache
 kill -USR2 $PID1  # groom cycle
-wait_ready $PORT1 'thread_work_total{role="groom"}' 3
+wait_ready $PORT1 'thread_work_total{role="groom"}' 98 # 3 complete cycles
 # move it around a couple of times to make it likely to hit a nonexistent entry during iteration
 mv R/debuginfod-rpms/rhel7 R/debuginfod-rpms/rhel7renamed
 kill -USR1 $PID1  # scan cycle



More information about the Elfutils-devel mailing list