patch rfc: debuginfod -Z (generalized archive) support

Frank Ch. Eigler fche@redhat.com
Wed Feb 5 20:09:00 GMT 2020


Hi -

A little extension lets us process arch-linux archives.  Still waiting
on some small test .pkg's from the arch folks for the elfutils
testsuite.  However, hand-testing on several larger files works!

commit b51ae89befeb81c8b51b15b7168c6e616255b486 (fche/pacman-Z)
Author: Frank Ch. Eigler <fche@redhat.com>
Date:   Wed Feb 5 15:04:18 2020 -0500

    debuginfod: generalized archive support
    
    Add a '-Z EXT=CMD' option to debuginfod, which lets it scan any given
    extension and run CMD on it to unwrap distro archives.  For example,
    for arch-linux pacman files, -Z '.tar.zst=zstdcat' lets debuginfod
    grok debug and source content in split-debuginfo files.

diff --git a/debuginfod/ChangeLog b/debuginfod/ChangeLog
index 8c97fdcf7085..d812e6d71ff0 100644
--- a/debuginfod/ChangeLog
+++ b/debuginfod/ChangeLog
@@ -1,3 +1,9 @@
+2020-02-05  Frank Ch. Eigler  <fche@redhat.com>
+
+	* debuginfod.cxx (argp options): Add -Z option.
+	(canonicalized_archive_entry_pathname): New function for
+	distro-agnostic file name matching/storage.
+
 2020-01-22  Frank Ch. Eigler  <fche@redhat.com>
 
 	* debuginfod.cxx (dwarf_extract_source_paths): Don't print
diff --git a/debuginfod/debuginfod.cxx b/debuginfod/debuginfod.cxx
index 623dbc593c70..0de6bbaea0ee 100644
--- a/debuginfod/debuginfod.cxx
+++ b/debuginfod/debuginfod.cxx
@@ -333,9 +333,10 @@ ARGP_PROGRAM_BUG_ADDRESS_DEF = PACKAGE_BUGREPORT;
 static const struct argp_option options[] =
   {
    { NULL, 0, NULL, 0, "Scanners:", 1 },
-   { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning threads.", 0 },
-   { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning threads.", 0 },
-   { "scan-deb-dir", 'U', NULL, 0, "Enable DEB scanning threads.", 0 },
+   { "scan-file-dir", 'F', NULL, 0, "Enable ELF/DWARF file scanning.", 0 },
+   { "scan-rpm-dir", 'R', NULL, 0, "Enable RPM scanning.", 0 },
+   { "scan-deb-dir", 'U', NULL, 0, "Enable DEB scanning.", 0 },
+   { "scan-archive", 'Z', "EXT=CMD", 0, "Enable arbitrary archive scanning.", 0 },
    // "source-oci-imageregistry"  ...
 
    { NULL, 0, NULL, 0, "Options:", 2 },
@@ -428,6 +429,15 @@ parse_opt (int key, char *arg,
       scan_archives[".deb"]="dpkg-deb --fsys-tarfile";
       scan_archives[".ddeb"]="dpkg-deb --fsys-tarfile";
       break;
+    case 'Z':
+      {
+        char* extension = strchr(arg, '=');
+        if (extension)
+          scan_archives[string(arg, (extension-arg))]=string(extension+1);
+        else
+          argp_failure(state, 1, EINVAL, "bad EXT=CMD format");
+      }
+      break;
     case 'L':
       traverse_logical = true;
       break;
@@ -1068,6 +1078,25 @@ class libarchive_fdcache
 static libarchive_fdcache fdcache;
 
 
+// Return the archive entry's path name canonicalized to one leading "/".
+// For security/portability reasons, distro-package archives variously
+// prefix member names with "./", nothing, or "/"; the single-"/" form
+// is assumed to match the dwarf-derived file names too.
+string canonicalized_archive_entry_pathname(struct archive_entry *e)
+{
+  string fn = archive_entry_pathname(e);
+  if (fn.size() == 0) // empty name: nothing to canonicalize
+    return fn;
+  if (fn[0] == '/') // already has a leading "/": return unchanged
+    return fn;
+  if (fn[0] == '.') // "./a/b" -> "/a/b"; NOTE(review): ".a" -> "a" as well
+    return fn.substr(1);
+  else // bare relative name: "a/b" -> "/a/b"
+    return string("/")+fn;
+}
+
+
+
 static struct MHD_Response*
 handle_buildid_r_match (int64_t b_mtime,
                         const string& b_source0,
@@ -1162,8 +1191,8 @@ handle_buildid_r_match (int64_t b_mtime,
       if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
         continue;
 
-      string fn = archive_entry_pathname (e);
-      if (fn != string(".")+b_source1)
+      string fn = canonicalized_archive_entry_pathname (e);
+      if (fn != b_source1)
         continue;
 
       // extract this file to a temporary file
@@ -2055,9 +2084,7 @@ archive_classify (const string& rps, string& archive_extension,
           if (! S_ISREG(archive_entry_mode (e))) // skip non-files completely
             continue;
 
-          string fn = archive_entry_pathname (e);
-          if (fn.size() > 1 && fn[0] == '.')
-            fn = fn.substr(1); // trim off the leading '.'
+          string fn = canonicalized_archive_entry_pathname (e);
 
           if (verbose > 3)
             obatched(clog) << "libarchive checking " << fn << endl;
@@ -2764,7 +2791,7 @@ main (int argc, char *argv[])
              "unexpected argument: %s", argv[remaining]);
 
   if (scan_archives.size()==0 && !scan_files && source_paths.size()>0)
-    obatched(clog) << "warning: without -F -R -U, ignoring PATHs" << endl;
+    obatched(clog) << "warning: without -F -R -U -Z, ignoring PATHs" << endl;
 
   fdcache.limit(fdcache_fds, fdcache_mbs);
 
@@ -2894,7 +2921,7 @@ main (int argc, char *argv[])
       obatched ob(clog);
       auto& o = ob << "scanning archive types ";
       for (auto&& arch : scan_archives)
-	o << arch.first << " ";
+	o << arch.first << "(" << arch.second << ") ";
       o << endl;
     }
   const char* du = getenv(DEBUGINFOD_URLS_ENV_VAR);
diff --git a/doc/ChangeLog b/doc/ChangeLog
index 651ea33d4106..36094d002f75 100644
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2020-02-05  Frank Ch. Eigler  <fche@redhat.com>
+
+	* debuginfod.8: Document new -Z flag and tweak other bits.
+
 2020-01-10  Mark Wielaard  <mark@klomp.org>
 
 	* debuginfod_find_debuginfo.3 (DEBUGINFOD_PROGRESS): Mention progress
diff --git a/doc/debuginfod.8 b/doc/debuginfod.8
index 166c7c4590ed..d6561edf7159 100644
--- a/doc/debuginfod.8
+++ b/doc/debuginfod.8
@@ -61,20 +61,22 @@ or
 ^C
 .ESAMPLE
 
-If the \fB\-R\fP and/or \fB-U\fP option is given, each file is scanned
-as an archive file that may contain ELF/DWARF/source files.  If \-R is
-given, the will scan RPMs; and/or if \-U is given, they will scan DEB
-/ DDEB files.  (The terms RPM and DEB and DDEB are used synonymously
-as "archives" in diagnostic messages.)  Because of complications such
-as DWZ-compressed debuginfo, may require \fItwo\fP traversal passes to
-identify all source code.  Source files for RPMs are only served from
-other RPMs, so the caution for \-F does not apply.  Note that due to
-Debian/Ubuntu packaging policies & mechanisms, debuginfod cannot
-resolve source files for DEB/DDEB at all.
-
-If no PATH is listed, or neither \fB\-F\fP nor \fB\-R\fP nor \fB\-U\fP
-option is given, then \fBdebuginfod\fP will simply serve content that
-it accumulated into its index in all previous runs.
+If any of the \fB\-R\fP, \fB-U\fP, or \fB-Z\fP options is given, each
+file is scanned as an archive file that may contain ELF/DWARF/source
+files.  Archive files are recognized by extension.  If \-R is given,
+".rpm" files are scanned; if \-U is given, ".deb" and ".ddeb" files
+are scanned; if \-Z is given, the listed extensions are scanned.
+Because of complications such as DWZ-compressed debuginfo, this may require
+\fItwo\fP traversal passes to identify all source code.  Source files
+for RPMs are only served from other RPMs, so the caution for \-F does
+not apply.  Note that due to Debian/Ubuntu packaging policies &
+mechanisms, debuginfod cannot resolve source files for DEB/DDEB at
+all.
+
+If no PATH is listed, or none of the scanning options is given, then
+\fBdebuginfod\fP will simply serve content that it accumulated into
+its index in all previous runs, and federate to any upstream
+debuginfod servers.
 
 
 .SH OPTIONS
@@ -91,6 +93,16 @@ Activate RPM patterns in archive scanning.  The default is off.
 .B "\-U"
 Activate DEB/DDEB patterns in archive scanning.  The default is off.
 
+.TP
+.B "\-Z EXT=CMD"
+Activate an additional pattern in archive scanning.  Files with name
+extension EXT will be processed with CMD.  CMD is invoked with the
+file name added to its argument list, and should produce the
+archive on its standard output.  debuginfod uses libarchive to consume
+the result, so it can accept a wide range of archive formats and
+compression.  (Include the dot in EXT.)  The default is no additional
+patterns.  This option may be repeated.
+
 .TP
 .B "\-d FILE" "\-\-database=FILE"
 Set the path of the sqlite database used to store the index.  This
@@ -123,7 +135,8 @@ against the full path of each file, based on its \fBrealpath(3)\fP
 canonicalization.  By default, all files are included and none are
 excluded.  A file that matches both include and exclude REGEX is
 excluded.  (The \fIcontents\fP of archive files are not subject to
-inclusion or exclusion filtering: they are all processed.)
+inclusion or exclusion filtering: they are all processed.)  Only the
+last of each type of regular expression given is used.
 
 .TP
 .B "\-t SECONDS"  "\-\-rescan\-time=SECONDS"



More information about the Elfutils-devel mailing list