/* -*- linux-c -*- * * staprun.c - SystemTap module loader * * Copyright (C) 2005-2019 Red Hat, Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * */ #define _XOPEN_SOURCE #define _BSD_SOURCE #define _DEFAULT_SOURCE #include "staprun.h" #include "../privilege.h" #include "../runtime/k_syms.h" #include #include #include #include #include #include #include /* used in dbug, _err and _perr */ char *__name__ = "staprun"; extern long delete_module(const char *, unsigned int); int send_relocations (); int send_tzinfo (); int send_privilege_credentials (privilege_t user_credentials); int send_remote_id (); static int remove_module(const char *name, int verb); static int stap_module_inserted = -1; static void term_signal_handler(int signum __attribute ((unused))) { if (stap_module_inserted == 0) { remove_module(modname, 1); free(modname); } _exit(1); } void setup_term_signals(void) { sigset_t s; struct sigaction a; /* blocking all signals while we set things up */ sigfillset(&s); sigprocmask(SIG_SETMASK, &s, NULL); /* handle signals */ memset(&a, 0, sizeof(a)); sigfillset(&a.sa_mask); a.sa_handler = term_signal_handler; sigaction(SIGHUP, &a, NULL); sigaction(SIGINT, &a, NULL); sigaction(SIGTERM, &a, NULL); sigaction(SIGQUIT, &a, NULL); /* unblock all signals */ sigemptyset(&s); sigprocmask(SIG_SETMASK, &s, NULL); } static int run_as(int exec_p, uid_t uid, gid_t gid, const char *path, char *const argv[]) { pid_t pid; int rstatus; if (verbose >= 2) { int i = 0; eprintf(exec_p ? "execing: ": "spawning: "); while (argv[i]) { eprintf("%s ", argv[i]); i++; } eprintf("\n"); } if (exec_p) pid = 0; else pid = fork(); if (pid < 0) { _perr("fork"); return -1; } if (pid == 0) /* child process, or exec_p */ { /* Make sure we run as the full user. If we're * switching to a non-root user, this won't allow * that process to switch back to root (since the * original process is setuid). */ if (setresgid (gid, gid, gid) < 0) { _perr("setresgid"); exit(1); } if (setresuid (uid, uid, uid) < 0) { _perr("setresuid"); exit(1); } /* Actually run the command. */ if (execv(path, argv) < 0) perror(path); _exit(1); } if (waitpid(pid, &rstatus, 0) < 0) return -1; if (WIFEXITED(rstatus)) return WEXITSTATUS(rstatus); return -1; } /* * Module to be inserted has one or more user-space probes. Make sure * uprobes is enabled. * If /proc/kallsyms lists a symbol in uprobes (e.g. unregister_uprobe), * we're done. * Else try "modprobe uprobes" to load the uprobes module (if any) * built with the kernel. * If that fails, load the uprobes module built in runtime/uprobes. */ static int enable_uprobes(void) { char *argv[10]; char runtimeko[2048]; int rc; /* Formerly, we did a grep /proc/kallsyms search to see if uprobes was already loaded into the kernel. But this is a race waiting to happen. Just try to load the thing. Quietly accept a -EEXIST error. */ /* NB: don't use /sbin/modprobe, without more env. sanitation. */ /* Try the specified module or the one from the runtime. */ if (uprobes_path) snprintf (runtimeko, sizeof(runtimeko), "%s", uprobes_path); else /* NB: since PR5163, share/runtime/uprobes/uprobes.ko is not built by systemtap. */ snprintf (runtimeko, sizeof(runtimeko), "%s/uprobes/uprobes.ko", (getenv("SYSTEMTAP_RUNTIME") ?: PKGDATADIR "/runtime")); dbug(2, "Inserting uprobes module from %s.\n", runtimeko); /* This module may be signed, so use insert_module to load it. */ argv[0] = NULL; rc = insert_module(runtimeko, NULL, argv, assert_uprobes_module_permissions, NULL); if ((rc == 0) || /* OK */ (rc == -EEXIST)) /* Someone else might have loaded it */ return 0; err("Couldn't insert module '%s': %s\n", runtimeko, moderror(errno)); return 1; /* failure */ } static int insert_stap_module(privilege_t *user_credentials) { char special_options[128]; int rc, fips_mode_fd; char fips_mode = '0'; char *misc = ""; /* Add the _stp_bufsize option. */ if (snprintf_chk(special_options, sizeof (special_options), "_stp_bufsize=%d", (int)buffer_size)) return -1; fips_mode_fd = open("/proc/sys/crypto/fips_enabled", O_RDONLY); if (fips_mode_fd >= 0) { char c; rc = read(fips_mode_fd, &c, 1); if (rc == 1) fips_mode = c; close (fips_mode_fd); } /* In FIPS mode, a kernel may panic if given an improperly-signed module. Right now, we have no way of signing them with the kernel build-time keys, so we punt. See also SecureBoot. */ if ((fips_mode != '0') && !getenv("STAP_FIPS_OVERRIDE")) { errno = EPERM; stap_module_inserted = -1; misc = "in FIPS mode "; } else { stap_module_inserted = insert_module(modpath, special_options, modoptions, assert_stap_module_permissions, user_credentials); } if (stap_module_inserted != 0) err("Couldn't insert module %s'%s': %s\n", misc, modpath, moderror(errno)); return stap_module_inserted; } static void remove_all_modules(void) { char *base; struct statfs st; struct dirent *d; DIR *moddir; /* NB: nothing to do with PR14245 */ if (statfs("/sys/kernel/debug", &st) == 0 && (int)st.f_type == (int)DEBUGFS_MAGIC) base = "/sys/kernel/debug/systemtap"; else base = "/proc/systemtap"; moddir = opendir(base); if (moddir) { while ((d = readdir(moddir))) { if (strcmp(d->d_name, ".") == 0) continue; if (strcmp(d->d_name, "..") == 0) continue; relay_basedir_fd = -1; /* each time! */ if (remove_module(d->d_name, 0) == 0) printf("Module %s removed.\n", d->d_name); } closedir(moddir); } } static int remove_module(const char *name, int verb) { int i, ret; enum { MAX_EINTR_TRIES = 5 }; dbug(2, "%s\n", name); #ifdef PR_SET_NAME /* Make self easier to identify in vmcrash images */ prctl (PR_SET_NAME, "staprun-d"); #endif (void) verb; /* XXX: ignore */ if (strcmp(name, "*") == 0) { remove_all_modules(); return 0; } /* We call init_ctl_channel/close_ctl_channel to check whether the module is a systemtap-built one (having the right files), and that it's already unattached (because otherwise it'd EBUSY the opens, and that it belongs to our uid (because otherwise a faccessat(2) test on the .cmd file will fail). */ ret = init_ctl_channel (name, 0); if (ret < 0) { err("'%s' is not a zombie systemtap module.\n", name); return ret; } close_ctl_channel (); dbug(2, "removing module %s\n", name); PROBE1(staprun, remove__module, name); for (i = 0; i < MAX_EINTR_TRIES; i++) { ret = delete_module (name, O_NONBLOCK); if (ret == 0 || (errno != EINTR && errno != EWOULDBLOCK)) break; usleep(100 * i); } if (ret != 0) { err("Couldn't remove module '%s': %s.\n", name, strerror(errno)); return 1; } dbug(1, "Module %s removed.\n", name); return 0; } /* As per PR13193 & PR1548, some kernels have a buggy kprobes-optimization code, which results in BUG/panics in certain circumstances. We turn off kprobes optimization as a conservative measure, unless told otherwise by an environment variable. */ void disable_kprobes_optimization() { /* Test if the file exists at all. */ const char* proc_kprobes = "/proc/sys/debug/kprobes-optimization"; char prev; int rc, fd; struct utsname uts; /* PR13814; disable this facility for new enough kernels, containing * these fix commits: 86b4ce31 46484688 3f33ab1c */ /* PR15484; whoops, not enough, problem still seen on Debian * 3.8.12 kernel. */ if (0 && (uname (&uts) == 0) && (strverscmp (uts.release, "3.4") >= 0)) return; /* Disable kprobes optimization due to problems seen on F29 5.0 kernel. PR24416; RCU hang detection with uprobes_onthefly.exp. */ /* RHBZ1697531 - x86 kprobe optimization causes rcu hang */ if ((0 && uname (&uts) == 0) && (strverscmp (uts.release, "4.8") >= 0)) return; if (getenv ("STAP_PR13193_OVERRIDE")) return; /* See the initial state; if it's already disabled, we do nothing. */ fd = open (proc_kprobes, O_RDONLY); if (fd < 0) return; rc = read (fd, &prev, sizeof(prev)); (void) close (fd); if (rc < 1 || prev == '0') /* Already disabled or unavailable */ return; fd = open (proc_kprobes, O_WRONLY); if (fd < 0) return; prev = '0'; /* really, next */ rc = write (fd, &prev, sizeof(prev)); (void) close (fd); if (rc == 1) dbug(1, "Disabled %s.\n", proc_kprobes); else dbug(1, "Error %d/%d disabling %s.\n", rc, errno, proc_kprobes); } /* BZ1552745: /proc/sys/kernel/kptr_restrict makes /sys/module ... addresses unreliable on 2018+ kernels. circumstances. We tweak this security measure (setting it to '1'), unless told otherwise by an environment variable. We could turn it back later, but this would create a race condition between concurrent runs of staprun. The '1' setting is nominally more secure than the default '0', except that for /sys/module/$MODULE/sections/$SECTION the '0' case produces obfuscated 0-based pointers, and '1' produces good ones (to a root user). Strange but true. */ void tweak_kptr_restrict() { const char* proc_kptr = "/proc/sys/kernel/kptr_restrict"; char prev; int rc, fd; struct utsname uts; /* Relevant change appears to have been introduced in v4.15 in * commit ef0010a30935de4e0211. */ if ((uname (&uts) == 0) && (strverscmp (uts.release, "4.15") < 0)) return; if (getenv ("STAP_BZ1552745_OVERRIDE")) return; /* See the initial state; if it's already set, we do nothing. */ fd = open (proc_kptr, O_RDONLY); if (fd < 0) return; rc = read (fd, &prev, sizeof(prev)); (void) close (fd); if (rc < 1 || prev == '1') /* Already set or unavailable */ return; fd = open (proc_kptr, O_WRONLY); if (fd < 0) return; prev = '1'; /* really, next */ rc = write (fd, &prev, sizeof(prev)); (void) close (fd); if (rc == 1) dbug(1, "Set %s.\n", proc_kptr); else dbug(1, "Error %d/%d setting %s.\n", rc, errno, proc_kptr); } int init_staprun(void) { privilege_t user_credentials = pr_unknown; int rc; dbug(2, "init_staprun\n"); if (mountfs() < 0) return -1; rc = 0; if (delete_mod) exit(remove_module(modname, 1)); if (attach_mod) { /* PR14245: prime the relay_basedir_fd pump. */ rc = init_ctl_channel (modname, 0); if (rc >= 0) close_ctl_channel (); } else /* if (!attach_mod) */ { if (need_uprobes && enable_uprobes() != 0) return -1; disable_kprobes_optimization(); if (insert_stap_module(& user_credentials) < 0) { if(!rename_mod && errno == EEXIST) err("Rerun with staprun option '-R' to rename this module.\n"); return -1; } rc = init_ctl_channel (modname, 0); if (rc >= 0) { /* If we are unable to send privilege credentials then we have an old (pre 1.7) stap module or a non-stap module. In either case, the privilege credentials required for loading the module have already been determined and checked (see check_groups, get_module_required_credentials). */ send_privilege_credentials(user_credentials); rc = send_relocations(); if (rc == 0) { rc = send_tzinfo(); if (rc == 0 && remote_id >= 0) send_remote_id(); } close_ctl_channel (); } if (rc != 0) remove_module(modname, 1); } return rc; } int main(int argc, char **argv) { int rc; /* Force libc to make our stderr messages atomic by enabling line buffering since stderr is unbuffered by default. Without this, libc is at liberty to split a single stderr message into multiple writes to the fd while holding flockfile(stderr). POSIX only guarantees that a single write(2) is atomic; chaining several write(2) calls together won't be atomic, and we don't want libc to do that within a single *fprintf(stderr) call since it'll mangle messages printed across different processes (*not* threads). */ setlinebuf(stderr); /* NB: Don't do the geteuid()!=0 check here, since we want to test command-line error-handling while running non-root. */ /* Get rid of a few standard environment variables (which */ /* might cause us to do unintended things). */ rc = unsetenv("IFS") || unsetenv("CDPATH") || unsetenv("ENV") || unsetenv("BASH_ENV"); if (rc) { _perr("unsetenv failed"); exit(-1); } if (getuid() != geteuid()) { /* setuid? */ rc = unsetenv("SYSTEMTAP_STAPRUN") || unsetenv("SYSTEMTAP_STAPIO") || unsetenv("SYSTEMTAP_RUNTIME"); if (rc) { _perr("unsetenv failed"); exit(-1); } } setup_signals(); setup_term_signals(); parse_args(argc, argv); /* PR14245, For security reasons, preclude "staprun -F fd". The -F option is only for stapio, but the overzealous quest for commonality doesn't let us express that nicer. */ if (relay_basedir_fd >= 0) { err(_("Relay basedir -F option is invalid for staprun\n")); exit(1); } /* NB: later on, some of our own code may set relay_basedir_fd, for passing onto stapio - or for our own reuse. That's OK. */ if (buffer_size) dbug(2, "Using a buffer of %u MB.\n", buffer_size); int mod_optind = optind; if (optind < argc) { parse_modpath(argv[optind++]); dbug(2, "modpath=\"%s\", modname=\"%s\"\n", modpath, modname); } if (optind < argc) { if (attach_mod) { err("Cannot have module options with attach (-A).\n"); usage(argv[0],1); } else { unsigned start_idx = 0; while (optind < argc && start_idx + 1 < MAXMODOPTIONS) modoptions[start_idx++] = argv[optind++]; modoptions[start_idx] = NULL; } } if (modpath == NULL || *modpath == '\0') { err("Need a module name or path to load.\n"); usage(argv[0],1); } if (geteuid() != 0) { err("The effective user ID of staprun must be set to the root user.\n" " Check permissions on staprun and ensure it is a setuid root program.\n"); exit(1); } char verbose_level[33]; sprintf(verbose_level, "%d", verbose); rc = setenv("SYSTEMTAP_VERBOSE", verbose_level, 0); if (rc) { _perr("SYSTEMTAP_VERBOSE setenv failed"); exit(-1); } if (init_staprun()) exit(1); argv[0] = getenv ("SYSTEMTAP_STAPIO") ?: PKGLIBDIR "/stapio"; /* Copy nenamed modname into argv */ if(rename_mod) argv[mod_optind] = modname; /* PR14245: pass -F fd to stapio. Unfortunately, this requires us to extend argv[], with all the C fun that entails. */ #ifdef HAVE_OPENAT if (relay_basedir_fd >= 0) { char ** new_argv = calloc(argc+2, sizeof(char *)); const int new_Foption_size = 10; /* -FNNNNN */ char * new_Foption = malloc(new_Foption_size); int i; if (new_argv && new_Foption) { snprintf (new_Foption, new_Foption_size, "-F%d", relay_basedir_fd); for (i=0; i < argc && argv[i] != NULL; i++) new_argv[i] = argv[i]; new_argv[i++] = new_Foption; /* overwrite the NULL */ new_argv[i++] = NULL; /* ensconce a new NULL */ argv = new_argv; } } #endif /* Run stapio */ if (run_as (1, getuid(), getgid(), argv[0], argv) < 0) { perror(argv[0]); goto err; } free(modname); return 0; err: remove_module(modname, 1); free(modname); return 1; } /* Send a variety of relocation-related data to the kernel: for the kernel proper, just the "_stext" symbol address; for all loaded modules, a variety of symbol base addresses. We do this under protest. The kernel ought expose this data to modules such as ourselves, but instead the upstream community continually shrinks its module-facing interfaces, including this stuff, even when users exist. PR26074: as of kernel 5.7+ / commit 0bd476e6c671 and under further protest, we must also send the address of kallsyms_lookup_name and kallsyms_for_each_symbol. */ int send_a_relocation (const char* module, const char* reloc, unsigned long long address) { struct _stp_msg_relocation msg; int rc; if (strlen(module) >= STP_MODULE_NAME_LEN-1) { dbug (1, "module name too long: %s\n", module); return -EINVAL; } strncpy (msg.module, module, STP_MODULE_NAME_LEN - 1); if (strlen(reloc) >= STP_SYMBOL_NAME_LEN-1) { dbug (1, "reloc name too long: %s\n", reloc); return -EINVAL; } strncpy (msg.reloc, reloc, STP_MODULE_NAME_LEN - 1); msg.address = address; rc = send_request (STP_RELOCATION, & msg, sizeof (msg)); if (rc != 0) perror ("Unable to send relocation"); return rc; } int send_relocation_kernel () { FILE* kallsyms; int rc = 0; errno = 0; kallsyms = fopen ("/proc/kallsyms", "r"); if (kallsyms == NULL) { perror("cannot open /proc/kallsyms"); // ... and the kernel module will almost certainly fail to initialize. return errno; } else { int found_stext = 0; int found_kallsyms_lookup_name = 0; int found_kallsyms_on_each_symbol = 0; int found_module_kallsyms_on_each_symbol = 0; int done_with_kallsyms = 0; char *line = NULL; size_t linesz = 0; while (! feof(kallsyms) && !done_with_kallsyms) { ssize_t linesize = getline (& line, & linesz, kallsyms); if (linesize > 0) { unsigned long long address; int pos = -1; if (sscanf (line, "%llx %*c %n", &address, &pos) != 1 || pos == -1) continue; // no symbols here if (linesize - pos == sizeof KERNEL_RELOC_SYMBOL && !strcmp(line + pos, KERNEL_RELOC_SYMBOL "\n")) { /* NB: even on ppc, we use the _stext relocation name. */ rc = send_a_relocation ("kernel", "_stext", address); if (rc != 0) break; found_stext=1; } else if (linesize - pos == sizeof "kallsyms_lookup_name" && !strcmp(line + pos, "kallsyms_lookup_name" "\n")) { rc = send_a_relocation ("kernel", "kallsyms_lookup_name", address); if (rc != 0) // non fatal, follows perror() dbug(1, "Relocation was kallsyms_lookup_name=%llx\n", address); found_kallsyms_lookup_name = 1; } else if (linesize - pos == sizeof "kallsyms_on_each_symbol" && !strcmp(line + pos, "kallsyms_on_each_symbol" "\n")) { rc = send_a_relocation ("kernel", "kallsyms_on_each_symbol", address); if (rc != 0) // non fatal, follows perror() dbug(1, "Relocation was reloc kallsyms_on_each_symbol=%llx\n", address); found_kallsyms_on_each_symbol = 1; } else if (linesize - pos == sizeof "module_kallsyms_on_each_symbol" && !strcmp(line + pos, "module_kallsyms_on_each_symbol" "\n")) { rc = send_a_relocation ("kernel", "module_kallsyms_on_each_symbol", address); if (rc != 0) // non fatal, follows perror() dbug(1, "Relocation was reloc module_kallsyms_on_each_symbol=%llx\n", address); found_module_kallsyms_on_each_symbol = 1; } } done_with_kallsyms = found_stext && found_kallsyms_lookup_name && found_kallsyms_on_each_symbol && found_module_kallsyms_on_each_symbol; } free (line); fclose (kallsyms); /* PR26074: Arguably, failure to find the kallsyms_* symbols may * not be a fatal error. The fallback kallsyms_lookup_name() * function in sym.c then returns 0, but it's barely conceivable * some modules never call it. */ /* if (!done_with_kallsyms) */ if (!found_stext) return rc; /* detect note section, send flag if there * NB: address=2 represents existed note, the real one in _stp_module */ if (!access("/sys/kernel/notes", R_OK)) rc = send_a_relocation ("kernel", ".note.gnu.build-id", 2); } return rc; } int send_relocation_modules () { unsigned i = 0; glob_t globbuf; globbuf.gl_pathc = 0; int r = glob("/sys/module/*/sections/*", GLOB_PERIOD, NULL, &globbuf); if (r == GLOB_NOSPACE || r == GLOB_ABORTED) return r; for (i=0; itm_gmtoff; strncpy (tzi.tz_name, now->tm_zone, STP_TZ_NAME_LEN - 1); rc = send_request(STP_TZINFO, & tzi, sizeof(tzi)); if (rc != 0) perror ("Unable to send time zone information"); return rc; } int send_privilege_credentials (privilege_t user_credentials) { struct _stp_msg_privilege_credentials pc; int rc; pc.pc_group_mask = user_credentials; rc = send_request(STP_PRIVILEGE_CREDENTIALS, & pc, sizeof(pc)); if (rc != 0) { /* Not an error. Happens when pre 1.7 modules are loaded. */ dbug (1, "Unable to send user privilege credentials\n"); } return rc; } int send_remote_id () { struct _stp_msg_remote_id rem; int rc; rem.remote_id = remote_id; strncpy (rem.remote_uri, remote_uri, STP_REMOTE_URI_LEN - 1); rem.remote_uri [STP_REMOTE_URI_LEN-1]='\0'; /* XXX: quietly truncate */ rc = send_request(STP_REMOTE_ID, & rem, sizeof(rem)); if (rc != 0) perror ("Unable to send remote id"); return rc; }