patch 5 debuginfod: prometheus metrics

Frank Ch. Eigler fche@redhat.com
Thu Nov 7 09:08:00 GMT 2019


From: "Frank Ch. Eigler" <fche@redhat.com>
Date: Thu, 7 Nov 2019 04:00:16 -0500
Subject: [PATCH 2/2] debuginfod 5: add /metrics endpoint

This webapi extensions allows admins to hook up debuginfod to a
prometheus-compatible monitoring system for general situational
statistics.  The metrics are simple enough that local curl requests
can give a user a sense of what's going on.  The metrics are
documented as unstable with respect to future versions.
---
 debuginfod/ChangeLog         |  11 ++
 debuginfod/debuginfod.8      |  13 +-
 debuginfod/debuginfod.cxx    | 242 ++++++++++++++++++++++++++---------
 tests/ChangeLog              |   5 +
 tests/run-debuginfod-find.sh |  13 +-
 5 files changed, 221 insertions(+), 63 deletions(-)

diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 2870b6df9392..47efb9b4a6fb 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,14 @@
+2019-11-07  Frank Ch. Eigler  <fche@redhat.com>
+
+	* debuginfod.cxx: Add /metrics endpoint.  Add numerous
+	calls to new functions inc_metric/set_metric to populate
+	threadsafe map containing stats.  Add http content-type
+	response headers throughout.
+	(thread_main_*): Simplify counter/timer flow.
+	(main): Reorder web service shutdown to leave http running
+	as long as possible.
+	* debuginfod.8: Document it, add security caution.
+
 2019-11-06  Frank Ch. Eigler  <fche@redhat.com>
 
 	* debuginfod.cxx: Add new -L (symlink-following) mode.
diff --git a/debuginfod/debuginfod.8 b/debuginfod/debuginfod.8
index 6b45dee68e70..e8aa26107e45 100644
--- a/debuginfod/debuginfod.8
+++ b/debuginfod/debuginfod.8
@@ -151,7 +151,7 @@ May be repeated to increase details.  The default verbosity is 0.
 
 .\" Much of the following text is duplicated with debuginfod-find.1
 
-The debuginfod's webapi resembles ordinary file service, where a GET
+debuginfod's webapi resembles ordinary file service, where a GET
 request with a path containing a known buildid results in a file.
 Unknown buildid / request combinations result in HTTP error codes.
 This file service resemblance is intentional, so that an installation
@@ -214,6 +214,14 @@ l l.
 \../bar/foo.c AT_comp_dir=/zoo	/buildid/BUILDID/source/zoo/../bar/foo.c
 .TE
 
+.SS /metrics
+
+This endpoint returns a Prometheus formatted text/plain dump of a
+variety of statistics about the operation of the debuginfod server.
+The exact set of metrics and their meanings may change in future
+versions.  Caution: configuration information (path names, versions)
+may be disclosed.
+
 .SH DATA MANAGEMENT
 
 debuginfod stores its index in an sqlite database in a densely packed
@@ -291,7 +299,8 @@ a denial-of-service in terms of RAM, CPU, disk I/O, or network I/O.
 If this is a problem, users are advised to install debuginfod with a
 HTTPS reverse-proxy front-end that enforces site policies for
 firewalling, authentication, integrity, authorization, and load
-control.
+control.  The \fI/metrics\fP webapi endpoint is probably not
+appropriate for disclosure to the public.
 
 When relaying queries to upstream debuginfods, debuginfod \fBdoes not\fP
 include any particular security features.  It trusts that the binaries
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 7c7a0c5d7ef5..efe8c80fa081 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -72,6 +72,7 @@ extern "C" {
 #include <cstring>
 #include <vector>
 #include <set>
+#include <map>
 #include <string>
 #include <iostream>
 #include <iomanip>
@@ -98,6 +99,14 @@ using namespace std;
 #include <sys/syscall.h>
 #endif
 
+#ifdef __linux__
+#define gettid() syscall(SYS_gettid)
+#else
+#define gettid() pthread_self()
+#endif
+
+
+
 
 // Roll this identifier for every sqlite schema incompatiblity.
 #define BUILDIDS "buildids9"
@@ -374,6 +383,13 @@ static regex_t file_exclude_regex;
 static int test_webapi_sleep; /* testing only */
 static bool traverse_logical;
 
+static void set_metric(const string& key, int64_t value);
+// static void inc_metric(const string& key);
+static void set_metric(const string& metric,
+                       const string& lname, const string& lvalue,
+                       int64_t value);
+static void inc_metric(const string& metric,
+                       const string& lname, const string& lvalue);
 
 /* Handle program arguments.  */
 static error_t
@@ -451,6 +467,7 @@ struct reportable_exception
     MHD_Response* r = MHD_create_response_from_buffer (message.size(),
                                                        (void*) message.c_str(),
                                                        MHD_RESPMEM_MUST_COPY);
+    MHD_add_response_header (r, "Content-Type", "text/plain");
     int rc = MHD_queue_response (c, code, r);
     MHD_destroy_response (r);
     return rc;
@@ -538,13 +555,7 @@ timestamp (ostream &o)
   }
 
   return o << "[" << (now2 ? now2 : "") << "] "
-           << "(" << getpid ()
-#ifdef __linux__
-           << "/" << syscall(SYS_gettid)
-#else
-           << "/" << pthread_self()
-#endif
-           << "): ";
+           << "(" << getpid () << "/" << gettid() << "): ";
 }
 
 
@@ -784,6 +795,7 @@ handle_buildid_f_match (int64_t b_mtime,
       return 0;
     }
 
+  inc_metric ("http_responses_total","result","file");
   struct MHD_Response* r = MHD_create_response_from_fd ((uint64_t) s.st_size, fd);
   if (r == 0)
     {
@@ -793,6 +805,7 @@ handle_buildid_f_match (int64_t b_mtime,
     }
   else
     {
+      MHD_add_response_header (r, "Content-Type", "application/octet-stream");
       add_mhd_last_modified (r, s.st_mtime);
       if (verbose > 1)
         obatched(clog) << "serving file " << b_source0 << endl;
@@ -889,6 +902,7 @@ handle_buildid_r_match (int64_t b_mtime,
           throw archive_exception(a, "cannot extract file");
         }
 
+      inc_metric ("http_responses_total","result","rpm");
       struct MHD_Response* r = MHD_create_response_from_fd (archive_entry_size(e), fd);
       if (r == 0)
         {
@@ -898,6 +912,7 @@ handle_buildid_r_match (int64_t b_mtime,
         }
       else
         {
+          MHD_add_response_header (r, "Content-Type", "application/octet-stream");
           add_mhd_last_modified (r, archive_entry_mtime(e));
           if (verbose > 1)
             obatched(clog) << "serving rpm " << b_source0 << " file " << b_source1 << endl;
@@ -1033,6 +1048,7 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
                                 suffix.c_str(), NULL);
   if (fd >= 0)
     {
+      inc_metric ("http_responses_total","result","upstream");
       struct stat s;
       int rc = fstat (fd, &s);
       if (rc == 0)
@@ -1040,6 +1056,7 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
           auto r = MHD_create_response_from_fd ((uint64_t) s.st_size, fd);
           if (r)
             {
+              MHD_add_response_header (r, "Content-Type", "application/octet-stream");
               add_mhd_last_modified (r, s.st_mtime);
               if (verbose > 1)
                 obatched(clog) << "serving file from upstream debuginfod/cache" << endl;
@@ -1059,11 +1076,84 @@ static struct MHD_Response* handle_buildid (const string& buildid /* unsafe */,
 
 ////////////////////////////////////////////////////////////////////////
 
+static map<string,int64_t> metrics; // arbitrary data for /metrics query
+// NB: store int64_t since all our metrics are integers; prometheus accepts double
+static mutex metrics_lock;
+
+// utility function for assembling prometheus-compatible
+// name="escaped-value" strings
+// https://prometheus.io/docs/instrumenting/exposition_formats/
+static string
+metric_label(const string& name, const string& value)
+{
+  string x = name + "=\"";
+  for (auto&& c : value)
+    switch(c)
+      {
+      case '\\': x += "\\\\"; break;
+      case '\"': x += "\\\""; break;
+      case '\n': x += "\\n"; break;
+      default: x += c; break;
+      }
+  x += "\"";
+  return x;
+}
+
+
+// add prometheus-format metric name + label tuple (if any) + value
+
+static void
+set_metric(const string& metric, int64_t value)
+{
+  unique_lock<mutex> lock(metrics_lock);
+  metrics[metric] = value;
+}
+#if 0 /* unused */
+static void
+inc_metric(const string& metric)
+{
+  unique_lock<mutex> lock(metrics_lock);
+  metrics[metric] ++;
+}
+#endif
+static void
+set_metric(const string& metric,
+           const string& lname, const string& lvalue,
+           int64_t value)
+{
+  string key = (metric + "{" + metric_label(lname, lvalue) + "}");
+  unique_lock<mutex> lock(metrics_lock);
+  metrics[key] = value;
+}
+
+static void
+inc_metric(const string& metric,
+           const string& lname, const string& lvalue)
+{
+  string key = (metric + "{" + metric_label(lname, lvalue) + "}");
+  unique_lock<mutex> lock(metrics_lock);
+  metrics[key] ++;
+}
+
+
+// and more for higher arity labels if needed
+
 
 static struct MHD_Response*
 handle_metrics ()
 {
-  throw reportable_exception("not yet implemented 2");
+  stringstream o;
+  {
+    unique_lock<mutex> lock(metrics_lock);
+    for (auto&& i : metrics)
+      o << i.first << " " << i.second << endl;
+  }
+  const string& os = o.str();
+  MHD_Response* r = MHD_create_response_from_buffer (os.size(),
+                                                     (void*) os.c_str(),
+                                                     MHD_RESPMEM_MUST_COPY);
+  MHD_add_response_header (r, "Content-Type", "text/plain");
+  return r;
 }
 
 
@@ -1120,10 +1210,14 @@ handler_cb (void * /*cls*/,
               suffix = url_copy.substr(slash3); // include the slash in the suffix
             }
 
+          inc_metric("http_requests_total", "type", artifacttype);
           r = handle_buildid(buildid, artifacttype, suffix, 0); // NB: don't care about result-fd
         }
       else if (url1 == "/metrics")
-        r = handle_metrics();
+        {
+          inc_metric("http_requests_total", "type", "metrics");
+          r = handle_metrics();
+        }
       else
         throw reportable_exception("webapi error, unrecognized /operation");
 
@@ -1136,6 +1230,7 @@ handler_cb (void * /*cls*/,
     }
   catch (const reportable_exception& e)
     {
+      inc_metric("http_responses_total","result","error");
       e.report(clog);
       return e.mhd_send_response (connection);
     }
@@ -1544,6 +1639,7 @@ scan_source_file_path (const string& dir)
                       elf_classify (fd, executable_p, debuginfo_p, buildid, sourcefiles);
                     else
                       throw libc_exception(errno, string("open ") + rps);
+                    inc_metric ("scanned_total","source","file");
                   }
 
                 // NB: we catch exceptions from elf_classify here too, so that we can
@@ -1694,26 +1790,33 @@ thread_main_scan_source_file_path (void* arg)
 
   unsigned rescan_timer = 0;
   sig_atomic_t forced_rescan_count = 0;
+  set_metric("thread_timer_max", "file", dir, rescan_s);
+  set_metric("thread_pid", "file", dir, gettid());
   while (! interrupted)
     {
-      try
-        {
-          if (rescan_timer == 0)
-            scan_source_file_path (dir);
-          else if (sigusr1 != forced_rescan_count)
-            {
-              forced_rescan_count = sigusr1;
-              scan_source_file_path (dir);
-            }
-        }
-      catch (const sqlite_exception& e)
+      set_metric("thread_timer", "file", dir, rescan_timer);
+      set_metric("thread_forced_total", "file", dir, forced_rescan_count);
+      if (rescan_s && rescan_timer > rescan_s)
+        rescan_timer = 0;
+      if (sigusr1 != forced_rescan_count)
         {
-          obatched(cerr) << e.message << endl;
+          forced_rescan_count = sigusr1;
+          rescan_timer = 0;
         }
+      if (rescan_timer == 0)
+        try
+          {
+            set_metric("thread_working", "file", dir, time(NULL));
+            inc_metric("thread_work_total", "file", dir);
+            scan_source_file_path (dir);
+            set_metric("thread_working", "file", dir, 0);
+          }
+        catch (const sqlite_exception& e)
+          {
+            obatched(cerr) << e.message << endl;
+          }
       sleep (1);
       rescan_timer ++;
-      if (rescan_s)
-        rescan_timer %= rescan_s;
     }
 
   return 0;
@@ -2021,6 +2124,7 @@ scan_source_rpm_path (const string& dir)
                                   f->fts_statp->st_mtime,
                                   my_fts_executable, my_fts_debuginfo, my_fts_sref, my_fts_sdef,
                                   my_fts_sref_complete_p);
+                    inc_metric ("scanned_total","source","rpm");
                   }
                 catch (const reportable_exception& e)
                   {
@@ -2097,26 +2201,33 @@ thread_main_scan_source_rpm_path (void* arg)
 
   unsigned rescan_timer = 0;
   sig_atomic_t forced_rescan_count = 0;
+  set_metric("thread_timer_max", "rpm", dir, rescan_s);
+  set_metric("thread_pid", "rpm", dir, gettid());
   while (! interrupted)
     {
-      try
+      set_metric("thread_timer", "rpm", dir, rescan_timer);
+      set_metric("thread_forced_total", "rpm", dir, forced_rescan_count);
+      if (rescan_s && rescan_timer > rescan_s)
+        rescan_timer = 0;
+      if (sigusr1 != forced_rescan_count)
         {
-          if (rescan_timer == 0)
-            scan_source_rpm_path (dir);
-          else if (sigusr1 != forced_rescan_count)
-            {
-              forced_rescan_count = sigusr1;
-              scan_source_rpm_path (dir);
-            }
-        }
-      catch (const sqlite_exception& e)
-        {
-          obatched(cerr) << e.message << endl;
+          forced_rescan_count = sigusr1;
+          rescan_timer = 0;
         }
+      if (rescan_timer == 0)
+        try
+          {
+            set_metric("thread_working", "rpm", dir, time(NULL));
+            inc_metric("thread_work_total", "rpm", dir);
+            scan_source_rpm_path (dir);
+            set_metric("thread_working", "rpm", dir, 0);
+          }
+        catch (const sqlite_exception& e)
+          {
+            obatched(cerr) << e.message << endl;
+          }
       sleep (1);
       rescan_timer ++;
-      if (rescan_s)
-        rescan_timer %= rescan_s;
     }
 
   return 0;
@@ -2144,6 +2255,10 @@ database_stats_report()
         << " "
         << (sqlite3_column_text(ps_query, 1) ?: (const unsigned char*) "NULL")
         << endl;
+
+      set_metric("groom", "statistic",
+                 ((const char*) sqlite3_column_text(ps_query, 0) ?: (const char*) "NULL"),
+                 (sqlite3_column_double(ps_query, 1)));
     }
 }
 
@@ -2217,26 +2332,33 @@ thread_main_groom (void* /*arg*/)
 {
   unsigned groom_timer = 0;
   sig_atomic_t forced_groom_count = 0;
+  set_metric("thread_timer_max", "role", "groom", groom_s);
+  set_metric("thread_pid", "role", "groom", gettid());
   while (! interrupted)
     {
-      try
+      set_metric("thread_timer", "role", "groom", groom_timer);
+      set_metric("thread_forced_total", "role", "groom", forced_groom_count);      
+      if (groom_s && groom_timer > groom_s)
+        groom_timer = 0;
+      if (sigusr2 != forced_groom_count)
         {
-          if (groom_timer == 0)
-            groom ();
-          else if (sigusr2 != forced_groom_count)
-            {
-              forced_groom_count = sigusr2;
-              groom ();
-            }
-        }
-      catch (const sqlite_exception& e)
-        {
-          obatched(cerr) << e.message << endl;
+          forced_groom_count = sigusr2;
+          groom_timer = 0;
         }
+      if (groom_timer == 0)
+        try
+          {
+            set_metric("thread_working", "role", "groom", time(NULL));
+            inc_metric("thread_work_total", "role", "groom");
+            groom ();
+            set_metric("thread_working", "role", "groom", 0);
+          }
+        catch (const sqlite_exception& e)
+          {
+            obatched(cerr) << e.message << endl;
+          }
       sleep (1);
       groom_timer ++;
-      if (groom_s)
-        groom_timer %= groom_s;
     }
 
   return 0;
@@ -2455,6 +2577,9 @@ main (int argc, char *argv[])
   obatched(clog) << "search concurrency " << concurrency << endl;
   obatched(clog) << "rescan time " << rescan_s << endl;
   obatched(clog) << "groom time " << groom_s << endl;
+  const char* du = getenv(DEBUGINFOD_URLS_ENV_VAR);
+  if (du && du[0] != '\0') // set to non-empty string?
+    obatched(clog) << "upstream debuginfod servers: " << du << endl;
 
   vector<pthread_t> source_file_scanner_threads;
   vector<pthread_t> source_rpm_scanner_threads;
@@ -2484,28 +2609,25 @@ main (int argc, char *argv[])
         source_rpm_scanner_threads.push_back(pt);
     }
 
-
-  const char* du = getenv(DEBUGINFOD_URLS_ENV_VAR);
-  if (du && du[0] != '\0') // set to non-empty string?
-    obatched(clog) << "upstream debuginfod servers: " << du << endl;
-
   /* Trivial main loop! */
+  set_metric("ready", 1);
   while (! interrupted)
     pause ();
+  set_metric("ready", 0);
 
   if (verbose)
     obatched(clog) << "stopping" << endl;
 
-  /* Stop all the web service threads. */
-  if (d4) MHD_stop_daemon (d4);
-  if (d6) MHD_stop_daemon (d6);
-
   /* Join any source scanning threads. */
   for (auto&& it : source_file_scanner_threads)
     pthread_join (it, NULL);
   for (auto&& it : source_rpm_scanner_threads)
     pthread_join (it, NULL);
   pthread_join (groom_thread, NULL);
+  
+  /* Stop all the web service threads. */
+  if (d4) MHD_stop_daemon (d4);
+  if (d6) MHD_stop_daemon (d6);
 
   /* With all threads known dead, we can clean up the global resources. */
   delete scan_concurrency_sem;
diff --git a/tests/ChangeLog b/tests/ChangeLog
index 3d50ee8623ee..156a693f8886 100644
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,8 @@
+2019-11-07  Frank Ch. Eigler  <fche@redhat.com>
+
+	* run-debuginfod-find.sh: Test debuginfod metrics via curl.
+	Fix federated testing, asserted by metrics.
+
 2019-11-06  Frank Ch. Eigler  <fche@redhat.com>
 
 	* run-debuginfod-find.sh: Test debuginfod -L mode.  Drop
diff --git a/tests/run-debuginfod-find.sh b/tests/run-debuginfod-find.sh
index 644901073d75..4c3e3cb306c2 100755
--- a/tests/run-debuginfod-find.sh
+++ b/tests/run-debuginfod-find.sh
@@ -181,7 +181,8 @@ sleep 3
 
 # have clients contact the new server
 export DEBUGINFOD_URLS=http://localhost:$PORT2
-testrun ${abs_builddir}/debuginfod_build_id_find -e F/prog 1
+rm -rf $DEBUGINFOD_CACHE_PATH
+testrun ${abs_top_builddir}/debuginfod/debuginfod-find debuginfo $BUILDID
 
 # confirm that first server can't resolve symlinked info in L/ but second can
 BUILDID=`env LD_LIBRARY_PATH=$ldpath ${abs_builddir}/../src/readelf \
@@ -202,6 +203,16 @@ export DEBUGINFOD_URLS="BAD http://localhost:$PORT1 localhost:$PORT1 http://loca
 
 testrun ${abs_builddir}/debuginfod_build_id_find -e F/prog2 1
 
+########################################################################
+
+# Fetch some metrics, if curl program is installed
+if which curl 2>/dev/null; then
+    curl http://localhost:$PORT1/badapi
+    curl http://localhost:$PORT1/metrics
+    curl http://localhost:$PORT2/metrics
+    curl http://localhost:$PORT1/metrics | grep -q 'http_responses_total.*result.*error'
+    curl http://localhost:$PORT2/metrics | grep -q 'http_responses_total.*result.*upstream'
+fi
 
 ########################################################################
 
-- 
2.21.0



More information about the Elfutils-devel mailing list