From 1d3e6fd902d0daded31ff457a1650e9a47528982 Mon Sep 17 00:00:00 2001 From: Sagar Patel Date: Thu, 8 Aug 2019 15:40:10 -0400 Subject: [PATCH] PR23285 (1): enable procfs probes for stapbpf The eBPF backend now supports procfs probes. This implementation uses FIFO special files instead of proc filesystem files. The file path format used is /var/tmp/systemtap-USER/MODNAME. One limitation is that both read and write probes cannot exist for the same file. 1) Added procfs probe data structures to hold probe information. 2) Created an interface between target variables and eBPF interpreter. 3) Dedicated a single thread for each file which monitors for I/O. 4) Developed a cleaning routine and error handling mechanisms. 5) Updated NEWS and man pages. small fix new procfs_bpf test --- NEWS | 4 + bpf-internal.h | 9 +- bpf-translate.cxx | 30 ++- elaborate.cxx | 2 +- man/stapprobes.3stap | 15 +- stapbpf/Makefile.am | 2 +- stapbpf/Makefile.in | 43 +++- stapbpf/bpfinterp.cxx | 43 +++- stapbpf/bpfinterp.h | 4 + stapbpf/stapbpf.8 | 1 + stapbpf/stapbpf.cxx | 276 +++++++++++++++++++++++- tapset-procfs.cxx | 80 ++++++- tapset/bpf/target.stp | 51 +++++ tapsets.h | 3 + testsuite/systemtap.base/procfs_bpf.exp | 118 ++++++++++ 15 files changed, 650 insertions(+), 31 deletions(-) create mode 100644 tapset/bpf/target.stp create mode 100644 testsuite/systemtap.base/procfs_bpf.exp diff --git a/NEWS b/NEWS index 3f671fda7..07a5a44dc 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,9 @@ * What's new in version 4.2, PRERELEASE +- The stapbpf backend now supports procfs probes. The implementation + uses FIFO special files in /var/tmp/systemtap-USER/MODNAME instead + of the proc filesystem files. + - The eBPF backend now uses bpf raw tracepoints for kernel.trace("*") probes. 
These have target variable arguments that match the arguments available for the traditional linux kernel modules diff --git a/bpf-internal.h b/bpf-internal.h index ef8d784f9..1b6f4a6ac 100644 --- a/bpf-internal.h +++ b/bpf-internal.h @@ -162,13 +162,18 @@ const opcode BPF_LD_MAP = BPF_LD | BPF_IMM | BPF_DW | (BPF_PSEUDO_MAP_FD << 8); FN(sprintf), \ FN(stapbpf_stat_get), \ FN(gettimeofday_ns), \ - FN(get_target), + FN(set_procfs_value), \ + FN(append_procfs_value), \ + FN(get_procfs_value), + const bpf_func_id BPF_FUNC_map_get_next_key = (bpf_func_id) -1; const bpf_func_id BPF_FUNC_sprintf = (bpf_func_id) -2; const bpf_func_id BPF_FUNC_stapbpf_stat_get = (bpf_func_id) -3; const bpf_func_id BPF_FUNC_gettimeofday_ns = (bpf_func_id) -4; const bpf_func_id BPF_FUNC_get_target = (bpf_func_id) -5; - +const bpf_func_id BPF_FUNC_set_procfs_value = (bpf_func_id) -6; +const bpf_func_id BPF_FUNC_append_procfs_value = (bpf_func_id) -7; +const bpf_func_id BPF_FUNC_get_procfs_value = (bpf_func_id) -8; struct insn { diff --git a/bpf-translate.cxx b/bpf-translate.cxx index 720e23d4e..e47b814b3 100644 --- a/bpf-translate.cxx +++ b/bpf-translate.cxx @@ -17,6 +17,8 @@ #include #include #include +#include +#include extern "C" { #include @@ -731,8 +733,8 @@ bpf_unparser::emit_store(expression *e, value *val) emit_mov(this_prog.lookup_reg(BPF_REG_4), this_prog.new_imm(0)); this_prog.mk_call(this_ins, BPF_FUNC_map_update_elem, 4); return; - } - } + } + } err: throw SEMANTIC_ERROR (_("unknown lvalue"), e->tok); } @@ -2368,17 +2370,11 @@ bpf_unparser::emit_context_var(bpf_context_vardecl *v) } void -bpf_unparser::visit_symbol (symbol *s) +bpf_unparser::visit_symbol(symbol *s) { vardecl *v = s->referent; assert (v->arity < 1); - if (bpf_context_vardecl *c = dynamic_cast(v)) - { - result = emit_context_var(c); - return; - } - auto g = glob.globals.find (v); if (g != glob.globals.end()) { @@ -4108,6 +4104,7 @@ translate_probe(program &prog, globals &glob, derived_probe *dp) u.add_prologue(); 
dp->body->visit (&u); + if (u.in_block()) u.emit_jmp(u.get_ret0_block()); } @@ -4422,6 +4419,21 @@ translate_bpf_pass (systemtap_session& s) } } + if (s.procfs_derived_probes) + { + sort_for_bpf_probe_arg_vector procfs_v; + sort_for_bpf(s, s.procfs_derived_probes, procfs_v); + + for (auto i = procfs_v.begin(); i != procfs_v.end(); ++i) + { + t = i->first->tok; + program p(target_user_bpfinterp); + translate_probe(p, glob, i->first); + p.generate(); + output_probe(eo, p, i->second, 0); + } + } + if (s.perf_derived_probes) { sort_for_bpf_probe_arg_vector perf_v; diff --git a/elaborate.cxx b/elaborate.cxx index 3a0bb2397..1a4ce2aa5 100644 --- a/elaborate.cxx +++ b/elaborate.cxx @@ -6610,7 +6610,7 @@ typeresolution_info::visit_target_symbol (target_symbol* e) if (e->saved_conversion_error) session.print_error (* (e->saved_conversion_error)); - else + else session.print_error (SEMANTIC_ERROR(_("unresolved target-symbol expression"), e->tok)); } diff --git a/man/stapprobes.3stap b/man/stapprobes.3stap index f9ee7dacd..665f4a8c4 100644 --- a/man/stapprobes.3stap +++ b/man/stapprobes.3stap @@ -1091,8 +1091,17 @@ procfs.write procfs.umask(UMASK).write .ESAMPLE +Note that there are a few differences when procfs probes are used in the stapbpf runtime. +.RI FIFO +special files are used instead of proc filesystem files. +These files are created in +/var/tmp/systemtap-USER/MODNAME. +.RI (USER +is the name of the user). +Additionally, users cannot create both read and write probes on the same file. + .I PATH -is the file name (relative to /proc/systemtap/MODNAME) to be created. +is the file name (relative to /proc/systemtap/MODNAME or /var/tmp/systemtap-USER/MODNAME) to be created. If no .I PATH is specified (as in the last two variants above), @@ -1102,7 +1111,7 @@ for input probes and should not be used as a .I PATH for procfs probes; see the input probe section below. 
.PP -When a user reads /proc/systemtap/MODNAME/PATH, the corresponding +When a user reads /proc/systemtap/MODNAME/PATH (normal runtime) or /var/tmp/systemtap-USER/MODNAME/PATH (stapbpf runtime), the corresponding procfs .I read probe is triggered. The string data to be read should be assigned to @@ -1114,7 +1123,7 @@ like this: procfs("PATH").read { $value = "100\\n" } .ESAMPLE .PP -When a user writes into /proc/systemtap/MODNAME/PATH, the +When a user writes into /proc/systemtap/MODNAME/PATH (normal runtime) or /var/tmp/systemtap-USER/MODNAME/PATH (stapbpf runtime), the corresponding procfs .I write probe is triggered. The data the user wrote is available in the diff --git a/stapbpf/Makefile.am b/stapbpf/Makefile.am index ba1869765..dc2795e6c 100644 --- a/stapbpf/Makefile.am +++ b/stapbpf/Makefile.am @@ -18,7 +18,7 @@ bin_PROGRAMS = stapbpf man_MANS = stapbpf.8 -stapbpf_SOURCES = stapbpf.cxx bpfinterp.cxx libbpf.c +stapbpf_SOURCES = stapbpf.cxx bpfinterp.cxx libbpf.c ../util.cxx stapbpf_CPPFLAGS = $(AM_CPPFLAGS) stapbpf_CFLAGS = $(AM_CFLAGS) stapbpf_CXXFLAGS = $(AM_CXXFLAGS) diff --git a/stapbpf/Makefile.in b/stapbpf/Makefile.in index e3a3146e7..49bf65b80 100644 --- a/stapbpf/Makefile.in +++ b/stapbpf/Makefile.in @@ -108,9 +108,11 @@ CONFIG_CLEAN_FILES = CONFIG_CLEAN_VPATH_FILES = am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man8dir)" PROGRAMS = $(bin_PROGRAMS) +am__dirstamp = $(am__leading_dot)dirstamp @HAVE_BPF_DECLS_TRUE@am_stapbpf_OBJECTS = stapbpf-stapbpf.$(OBJEXT) \ @HAVE_BPF_DECLS_TRUE@ stapbpf-bpfinterp.$(OBJEXT) \ -@HAVE_BPF_DECLS_TRUE@ stapbpf-libbpf.$(OBJEXT) +@HAVE_BPF_DECLS_TRUE@ stapbpf-libbpf.$(OBJEXT) \ +@HAVE_BPF_DECLS_TRUE@ ../stapbpf-util.$(OBJEXT) stapbpf_OBJECTS = $(am_stapbpf_OBJECTS) am__DEPENDENCIES_1 = @HAVE_BPF_DECLS_TRUE@stapbpf_DEPENDENCIES = $(am__DEPENDENCIES_1) @@ -131,8 +133,9 @@ am__v_at_1 = DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) depcomp = $(SHELL) $(top_srcdir)/depcomp am__maybe_remake_depfiles = depfiles 
-am__depfiles_remade = ./$(DEPDIR)/stapbpf-bpfinterp.Po \ - ./$(DEPDIR)/stapbpf-libbpf.Po ./$(DEPDIR)/stapbpf-stapbpf.Po +am__depfiles_remade = ../$(DEPDIR)/stapbpf-util.Po \ + ./$(DEPDIR)/stapbpf-bpfinterp.Po ./$(DEPDIR)/stapbpf-libbpf.Po \ + ./$(DEPDIR)/stapbpf-stapbpf.Po am__mv = mv -f AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -431,7 +434,7 @@ AM_CPPFLAGS = -D_GNU_SOURCE -I$(srcdir)/../includes \ -DPKGLIBDIR='"$(pkglibexecdir)"' -DLOCALEDIR='"$(localedir)"' AM_LDFLAGS = @PIELDFLAGS@ @HAVE_BPF_DECLS_TRUE@man_MANS = stapbpf.8 -@HAVE_BPF_DECLS_TRUE@stapbpf_SOURCES = stapbpf.cxx bpfinterp.cxx libbpf.c +@HAVE_BPF_DECLS_TRUE@stapbpf_SOURCES = stapbpf.cxx bpfinterp.cxx libbpf.c ../util.cxx @HAVE_BPF_DECLS_TRUE@stapbpf_CPPFLAGS = $(AM_CPPFLAGS) $(am__append_1) @HAVE_BPF_DECLS_TRUE@stapbpf_CFLAGS = $(AM_CFLAGS) @HAVE_BPF_DECLS_TRUE@stapbpf_CXXFLAGS = $(AM_CXXFLAGS) @@ -517,6 +520,14 @@ uninstall-binPROGRAMS: clean-binPROGRAMS: -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) +../$(am__dirstamp): + @$(MKDIR_P) .. 
+ @: > ../$(am__dirstamp) +../$(DEPDIR)/$(am__dirstamp): + @$(MKDIR_P) ../$(DEPDIR) + @: > ../$(DEPDIR)/$(am__dirstamp) +../stapbpf-util.$(OBJEXT): ../$(am__dirstamp) \ + ../$(DEPDIR)/$(am__dirstamp) stapbpf$(EXEEXT): $(stapbpf_OBJECTS) $(stapbpf_DEPENDENCIES) $(EXTRA_stapbpf_DEPENDENCIES) @rm -f stapbpf$(EXEEXT) @@ -524,10 +535,12 @@ stapbpf$(EXEEXT): $(stapbpf_OBJECTS) $(stapbpf_DEPENDENCIES) $(EXTRA_stapbpf_DEP mostlyclean-compile: -rm -f *.$(OBJEXT) + -rm -f ../*.$(OBJEXT) distclean-compile: -rm -f *.tab.c +@AMDEP_TRUE@@am__include@ @am__quote@../$(DEPDIR)/stapbpf-util.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stapbpf-bpfinterp.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stapbpf-libbpf.Po@am__quote@ # am--include-marker @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stapbpf-stapbpf.Po@am__quote@ # am--include-marker @@ -611,6 +624,20 @@ stapbpf-bpfinterp.obj: bpfinterp.cxx @AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='bpfinterp.cxx' object='stapbpf-bpfinterp.obj' libtool=no @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(stapbpf_CPPFLAGS) $(CPPFLAGS) $(stapbpf_CXXFLAGS) $(CXXFLAGS) -c -o stapbpf-bpfinterp.obj `if test -f 'bpfinterp.cxx'; then $(CYGPATH_W) 'bpfinterp.cxx'; else $(CYGPATH_W) '$(srcdir)/bpfinterp.cxx'; fi` + +../stapbpf-util.o: ../util.cxx +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(stapbpf_CPPFLAGS) $(CPPFLAGS) $(stapbpf_CXXFLAGS) $(CXXFLAGS) -MT ../stapbpf-util.o -MD -MP -MF ../$(DEPDIR)/stapbpf-util.Tpo -c -o ../stapbpf-util.o `test -f '../util.cxx' || echo '$(srcdir)/'`../util.cxx +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) ../$(DEPDIR)/stapbpf-util.Tpo ../$(DEPDIR)/stapbpf-util.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='../util.cxx' 
object='../stapbpf-util.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(stapbpf_CPPFLAGS) $(CPPFLAGS) $(stapbpf_CXXFLAGS) $(CXXFLAGS) -c -o ../stapbpf-util.o `test -f '../util.cxx' || echo '$(srcdir)/'`../util.cxx + +../stapbpf-util.obj: ../util.cxx +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(stapbpf_CPPFLAGS) $(CPPFLAGS) $(stapbpf_CXXFLAGS) $(CXXFLAGS) -MT ../stapbpf-util.obj -MD -MP -MF ../$(DEPDIR)/stapbpf-util.Tpo -c -o ../stapbpf-util.obj `if test -f '../util.cxx'; then $(CYGPATH_W) '../util.cxx'; else $(CYGPATH_W) '$(srcdir)/../util.cxx'; fi` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) ../$(DEPDIR)/stapbpf-util.Tpo ../$(DEPDIR)/stapbpf-util.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='../util.cxx' object='../stapbpf-util.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(stapbpf_CPPFLAGS) $(CPPFLAGS) $(stapbpf_CXXFLAGS) $(CXXFLAGS) -c -o ../stapbpf-util.obj `if test -f '../util.cxx'; then $(CYGPATH_W) '../util.cxx'; else $(CYGPATH_W) '$(srcdir)/../util.cxx'; fi` install-man8: $(man_MANS) @$(NORMAL_INSTALL) @list1=''; \ @@ -742,6 +769,8 @@ clean-generic: distclean-generic: -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) -test . 
= "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + -rm -f ../$(DEPDIR)/$(am__dirstamp) + -rm -f ../$(am__dirstamp) maintainer-clean-generic: @echo "This command is intended for maintainers to use" @@ -753,7 +782,8 @@ clean: clean-am clean-am: clean-binPROGRAMS clean-generic mostlyclean-am distclean: distclean-am - -rm -f ./$(DEPDIR)/stapbpf-bpfinterp.Po + -rm -f ../$(DEPDIR)/stapbpf-util.Po + -rm -f ./$(DEPDIR)/stapbpf-bpfinterp.Po -rm -f ./$(DEPDIR)/stapbpf-libbpf.Po -rm -f ./$(DEPDIR)/stapbpf-stapbpf.Po -rm -f Makefile @@ -802,7 +832,8 @@ install-ps-am: installcheck-am: maintainer-clean: maintainer-clean-am - -rm -f ./$(DEPDIR)/stapbpf-bpfinterp.Po + -rm -f ../$(DEPDIR)/stapbpf-util.Po + -rm -f ./$(DEPDIR)/stapbpf-bpfinterp.Po -rm -f ./$(DEPDIR)/stapbpf-libbpf.Po -rm -f ./$(DEPDIR)/stapbpf-stapbpf.Po -rm -f Makefile diff --git a/stapbpf/bpfinterp.cxx b/stapbpf/bpfinterp.cxx index faa2ab609..48069584a 100644 --- a/stapbpf/bpfinterp.cxx +++ b/stapbpf/bpfinterp.cxx @@ -28,6 +28,7 @@ #include "bpfinterp.h" #include "libbpf.h" #include "../bpf-internal.h" +#include "../util.h" #define stapbpf_abort(reason) \ ({ fprintf(stderr, _("bpfinterp.cxx:%d: %s\n"), \ @@ -359,6 +360,32 @@ bpf_get_target() return target_pid; } +uint64_t +bpf_set_procfs_value(char* msg, bpf_transport_context* ctx) +{ + assert(msg != nullptr); + + ctx->procfs_msg = std::string(msg); + + return 0; +} + +uint64_t +bpf_append_procfs_value(char* msg, bpf_transport_context* ctx) +{ + assert(msg != nullptr); + + ctx->procfs_msg.append(std::string(msg)); + + return 0; +} + +uint64_t +bpf_get_procfs_value(bpf_transport_context* ctx) +{ + return (uint64_t) (ctx->procfs_msg.data()); +} + enum bpf_perf_event_ret bpf_handle_transport_msg(void *buf, size_t size, bpf_transport_context *ctx) @@ -479,7 +506,11 @@ bpf_interpret(size_t ninsns, const struct bpf_insn insns[], memset(regs, 0x0, sizeof(uint64_t) * MAX_BPF_REG); const struct bpf_insn *i = insns; static 
std::vector map_values; - static std::vector strings; // TODO: could clear on exit? + + // Multiple threads accessing strings can cause concurrency issues for + // procfs_probes. However, the procfs_lock should prevent this and thus, + // clearing it on exit is unecessary for now. + static std::vector strings; bpf_map_def *map_attrs = ctx->map_attrs; std::vector &map_fds = *ctx->map_fds; @@ -752,6 +783,15 @@ bpf_interpret(size_t ninsns, const struct bpf_insn insns[], case bpf::BPF_FUNC_get_target: dr = bpf_get_target(); break; + case bpf::BPF_FUNC_set_procfs_value: + dr = bpf_set_procfs_value(as_str(regs[1]), ctx); + break; + case bpf::BPF_FUNC_append_procfs_value: + dr = bpf_append_procfs_value(as_str(regs[1]), ctx); + break; + case bpf::BPF_FUNC_get_procfs_value: + dr = bpf_get_procfs_value(ctx); + break; default: stapbpf_abort("unknown helper function"); } @@ -780,5 +820,6 @@ bpf_interpret(size_t ninsns, const struct bpf_insn insns[], for (uint64_t *ptr : map_values) free(ptr); map_values.clear(); // XXX: avoid double free + return result; } diff --git a/stapbpf/bpfinterp.h b/stapbpf/bpfinterp.h index 42c9cbf5d..732fc25fe 100644 --- a/stapbpf/bpfinterp.h +++ b/stapbpf/bpfinterp.h @@ -46,6 +46,10 @@ struct bpf_transport_context { std::vector *interned_strings; std::unordered_map *aggregates; // XXX: Could be refactored into a single global struct bpf_global_context. + + // Data for procfs probes. Multiple threads will be accessing this variable. + // However, the procfs_lock should prevent any concurrency issues. 
+ std::string procfs_msg; // Data for an in-progress printf request: bool in_printf; diff --git a/stapbpf/stapbpf.8 b/stapbpf/stapbpf.8 index 9daf5b243..3e7163637 100644 --- a/stapbpf/stapbpf.8 +++ b/stapbpf/stapbpf.8 @@ -129,6 +129,7 @@ kernel.* process.* timer.* perf.* +procfs.* .ESAMPLE In general, probes based on the kprobes, uprobes, tracepoint and perf diff --git a/stapbpf/stapbpf.cxx b/stapbpf/stapbpf.cxx index a8d6c8ee3..4f6530f4f 100644 --- a/stapbpf/stapbpf.cxx +++ b/stapbpf/stapbpf.cxx @@ -29,17 +29,20 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include "bpfinterp.h" +#include "../util.h" extern "C" { #include @@ -80,10 +83,17 @@ static const char *module_name; static const char *module_basename; static const char *script_name; // name of original systemtap script static const char *module_license; + +static const char *user; // username +static std::string prefix; // used to create procfs-like probe directory + static Elf *module_elf; static uint32_t kernel_version; +// Locks for accessing procfs-like probe messages +std::mutex procfs_lock; + // Sized by the contents of the "maps" section. 
static bpf_map_def *map_attrs; static std::vector map_fds; @@ -132,6 +142,27 @@ static Elf_Data *prog_end; static void unregister_kprobes(const size_t nprobes); +struct procfsprobe_data +{ + std::string path; + uint64_t umask; + char type; // either 'r' (read) or 'w' (write) + uint64_t maxsize_val; + Elf_Data* read_prog; + std::vector write_prog; + + // ctor for read probes + procfsprobe_data(string path, uint64_t umask, char type, uint64_t maxsize_val, Elf_Data* prog) + : path(path), umask(umask), type(type), maxsize_val(maxsize_val), read_prog(prog) + { assert (type == 'r'); } + + // ctor for write probes + procfsprobe_data(string path, uint64_t umask, char type, uint64_t maxsize_val, std::vector prog) + : path(path), umask(umask), type(type), maxsize_val(maxsize_val), write_prog(prog) + { assert (type == 'w'); } +}; + + struct kprobe_data { string args; @@ -200,6 +231,7 @@ struct trace_data { } }; +static std::vector procfsprobes; static std::vector kprobes; static std::vector timers; static std::vector perf_probes; @@ -235,7 +267,6 @@ fatal_elf() fatal("%s\n", elf_errmsg(-1)); } - // XXX: based on get_online_cpus()/read_cpu_range() // in bcc src/cc/common.cc // @@ -587,6 +618,39 @@ maybe_collect_kprobe(const char *name, unsigned name_idx, kprobes.push_back(kprobe_data(type, arg, fd)); } +static void +collect_procfsprobe(const char *name, Elf_Data* prog) +{ + uint64_t umask; + uint64_t maxsize_val; + char type; + char fifoname[PATH_MAX]; + + int res = sscanf(name, "procfsprobe/%lu/%c/%lu/%s", &umask, &type, &maxsize_val, fifoname); + + if (res != 4) + fatal("unable to parse name of probe: %s", name); + + std::string path(fifoname); + + if (type == 'r') + procfsprobes.push_back(procfsprobe_data(path, umask, type, maxsize_val, prog)); + else + { + // Check if a write probe with the same path already exists + for (unsigned i = 0; i < procfsprobes.size(); i++) + if (procfsprobes[i].path == string(path) && procfsprobes[i].type == 'w') + { + 
procfsprobes[i].write_prog.push_back(prog); + return; + } + + std::vector progs; + progs.push_back(prog); + procfsprobes.push_back(procfsprobe_data(path, umask, type, maxsize_val, progs)); + } +} + static void collect_uprobe(const char *name, unsigned name_idx, unsigned fd_idx) { @@ -1342,6 +1406,19 @@ load_bpf_file(const char *module) if (ehdr == NULL) fatal_elf(); + /* Get username and set directory prefix: */ + user = getlogin(); + + if (!user) + fatal("an error occured while retrieving username. %s.\n", strerror(errno)); + + // TODO: fix script_name so we can directly use it here + + std::string module_name = std::string(module_basename); + module_name = module_name.substr(0, module_name.size() - 3); + + prefix = "/var/tmp/systemtap-" + std::string(user) + "/" + module_name + "/"; + // Byte order should match the host, since we're loading locally. { const char *end_str; @@ -1388,6 +1465,8 @@ load_bpf_file(const char *module) unsigned begin_idx = 0; unsigned end_idx = 0; + std::vector procfsprobes_idx; + // First pass to identify special sections, and make sure // all data is readable. for (unsigned i = 1; i < shnum; ++i) @@ -1434,6 +1513,11 @@ load_bpf_file(const char *module) begin_idx = i; else if (strcmp(shname, "stap_end") == 0) end_idx = i; + else if (strncmp(shname, "procfs", strlen("procfs")) == 0) { + // procfs probes have a "procfs" prefix in their names, we don't + // use normal strcmp as the full shname includes args + procfsprobes_idx.push_back(i); + } } // Two special sections are not optional. @@ -1445,6 +1529,7 @@ load_bpf_file(const char *module) script_name = static_cast(sh_data[script_name_idx]->d_buf); else script_name = ""; + if (version_idx != 0) { unsigned long long size = shdrs[version_idx]->sh_size; @@ -1535,7 +1620,7 @@ load_bpf_file(const char *module) prog_fds[i] = prog_load(sh_data[i], sh_name[i]); } - // Remember begin and end probes. + // Remember begin, end and procfs-like probes. 
if (begin_idx) { Elf64_Shdr *shdr = shdrs[begin_idx]; @@ -1549,6 +1634,15 @@ load_bpf_file(const char *module) prog_end = sh_data[end_idx]; } + for (unsigned i = 0; i < procfsprobes_idx.size(); ++i) + { + unsigned actual_idx = procfsprobes_idx[i]; + + Elf64_Shdr *shdr = shdrs[actual_idx]; + if (shdr->sh_flags & SHF_EXECINSTR) + collect_procfsprobe(sh_name[actual_idx], sh_data[actual_idx]); + } + // Record all kprobes. if (kprobes_idx != 0) { @@ -1737,6 +1831,178 @@ perf_event_loop(pthread_t main_thread) return; } + +static void +procfs_read_event_loop (procfsprobe_data* data, bpf_transport_context* uctx) +{ + std::string path_s = prefix + data->path; + const char* path = path_s.c_str(); + + Elf_Data* prog = data->read_prog; + + while (true) + { + int fd = open(path, O_WRONLY); + + if (fd == -1) + { + if (errno == ENOENT) + fatal("an error occured while opening procfs fifo (%s). %s.\n", path, strerror(errno)); + + fprintf(stderr, "WARNING: an error occurred while opening procfs fifo (%s). %s.\n", + path, strerror(errno)); + continue; + } + + procfs_lock.lock(); + + // Run the probe and collect the message. + bpf_interpret(prog->d_size / sizeof(bpf_insn), static_cast(prog->d_buf), uctx); + + // Make a copy of the message. + std::string msg = uctx->procfs_msg; + + procfs_lock.unlock(); + + if (data->maxsize_val && (msg.size() > data->maxsize_val - 1)) + fprintf(stderr, "WARNING: procfs message size (%ld) exceeds specified maximum size (%ld).\n", + msg.size() + 1, data->maxsize_val); + + if (write(fd, msg.c_str(), msg.size() + 1) == -1) + { + fprintf(stderr, "WARNING: an error occurred while writing to procfs fifo (%s). %s.\n", + path, strerror(errno)); + (void) close(fd); + continue; + } + + (void) close(fd); + + // We're not sure at this point whether the read end of the pipe has closed. We + // perform a small open hack to spin until read end of the pipe has closed. 
+ + do { + + fd = open(path, O_WRONLY | O_NONBLOCK); + + if (fd != -1) close(fd); + + } while (fd != -1); + } +} + + +static void +procfs_write_event_loop (procfsprobe_data* data, bpf_transport_context* uctx) +{ + std::string path_s = prefix + data->path; + const char* path = path_s.c_str(); + + std::vector prog = data->write_prog; + + while (true) + { + int fd = open(path, O_RDONLY); + + if (fd == -1) + { + if (errno == ENOENT) + fatal("an error occured while opening procfs fifo (%s). %s.\n", path, strerror(errno)); + + fprintf(stderr, "WARNING: an error occurred while opening procfs fifo (%s). %s.\n", + path, strerror(errno)); + continue; + } + + std::string msg; + + unsigned read_size = 1024; + int bytes_read; + + do { + + char buffer_feed[read_size]; + bytes_read = read(fd, buffer_feed, read_size); + + if (bytes_read == -1) + fprintf(stderr, "WARNING: an error occurred while reading from procfs fifo (%s). %s.\n", + path, strerror(errno)); + + if (bytes_read > 0) + msg.append(std::string(buffer_feed)); + + } while (bytes_read > 0); + + (void) close(fd); + + procfs_lock.lock(); + + uctx->procfs_msg = msg; + + // Now that we have the message, run the probes serially. + for (unsigned i = 0; i < prog.size(); ++i) + bpf_interpret(prog[i]->d_size / sizeof(bpf_insn), static_cast(prog[i]->d_buf), uctx); + + procfs_lock.unlock(); + } +} + + +static void +procfs_cleanup() +{ + // Delete files and directories created for procfs-like probes. + for (size_t k = 0; k < procfsprobes.size(); ++k) + { + std::string file_s = prefix + procfsprobes[k].path; + const char* file = file_s.c_str(); + if (remove_file_or_dir(file)) + fprintf(stderr, "WARNING: an error occurred while deleting a file (%s). %s.\n", file, strerror(errno)); + } + + const char* dir = prefix.c_str(); + if (procfsprobes.size() > 0 && remove_file_or_dir(dir)) + fprintf(stderr, "WARNING: an error ocurred while deleting a directory (%s). 
%s.\n", dir, strerror(errno)); +} + + +static void +procfs_spawn(bpf_transport_context* uctx) +{ + // Enable cleanup routine. + if (atexit(procfs_cleanup)) + fatal("an error occurred while setting up procfs cleaner. %s.\n", strerror(errno)); + + // Create directory for procfs-like probes. + if (procfsprobes.size() > 0 && create_dir(prefix.c_str())) + fatal("an error occurred while making procfs directory. %s.\n", strerror(errno)); + + // Create all of the fifos used for procfs-like probes and spawn threads. + for (size_t k =0; k < procfsprobes.size(); ++k) + { + procfsprobe_data* data = &procfsprobes[k]; + + std::string path = prefix + data->path; + + uint64_t cmask = umask(data->umask); + + mode_t mode = (data->type == 'r') ? 0444 : 0222; + + if ((mkfifo(path.c_str(), mode) == -1)) + fatal("an error occured while making procfs fifos. %s.\n", strerror(errno)); + + // TODO: Could set the owner/group of the fifo to the effective user. + + umask(cmask); + + if (data->type == 'r') + std::thread(procfs_read_event_loop, data, uctx).detach(); + else + std::thread(procfs_write_event_loop, data, uctx).detach(); + } +} + + static void usage(const char *argv0) { @@ -1868,6 +2134,9 @@ main(int argc, char **argv) // PR22330: Listen for perf_events: std::thread(perf_event_loop, pthread_self()).detach(); + // Spawn all procfs threads. + procfs_spawn(&uctx); + // Now that the begin probe has run and the perf_event listener is active, enable the kprobes. ioctl(group_fd, PERF_EVENT_IOC_ENABLE, 0); @@ -1887,6 +2156,9 @@ main(int argc, char **argv) unregister_tracepoints(tracepoint_probes.size()); unregister_raw_tracepoints(raw_tracepoint_probes.size()); + // Clean procfs-like probe files. 
+ procfs_cleanup(); + // We are now running exit probes, so ^C should exit immediately: exit_phase = 1; signal(SIGINT, (sighandler_t)sigint); // restore previously ignored signal diff --git a/tapset-procfs.cxx b/tapset-procfs.cxx index 49ddb1814..73719e873 100644 --- a/tapset-procfs.cxx +++ b/tapset-procfs.cxx @@ -41,7 +41,6 @@ struct procfs_derived_probe: public derived_probe int64_t umask; string variable_name; - procfs_derived_probe (systemtap_session &, probe* p, probe_point* l, string ps, bool w, int64_t m, int64_t umask); void join_group (systemtap_session& s); @@ -62,6 +61,10 @@ struct procfs_probe_set struct procfs_derived_probe_group: public generic_dpg { + friend bool sort_for_bpf(systemtap_session& s, + procfs_derived_probe_group *pr, + sort_for_bpf_probe_arg_vector &v); + private: map probes_by_path; typedef map::iterator p_b_p_iterator; @@ -72,7 +75,7 @@ public: procfs_derived_probe_group () : has_read_probes(false), has_write_probes(false) {} - void enroll (procfs_derived_probe* probe); + void enroll (procfs_derived_probe* probe, systemtap_session& s); void emit_kernel_module_init (systemtap_session& s); void emit_kernel_module_exit (systemtap_session& s); void emit_module_decls (systemtap_session& s); @@ -80,6 +83,37 @@ public: void emit_module_exit (systemtap_session& s); }; +bool +sort_for_bpf(systemtap_session& s __attribute__ ((unused)), + procfs_derived_probe_group *pr, + sort_for_bpf_probe_arg_vector &v) +{ + if (!pr) + return false; + + for (auto i = pr->probes_by_path.begin(); i != pr->probes_by_path.end(); ++i) + { + procfs_derived_probe *read_probe = i->second->read_probe; + + if (read_probe) + { + stringstream s; + s << "procfsprobe/" << read_probe->umask << "/r/" << read_probe->maxsize_val << "/" << i->first; + v.push_back(std::pair (read_probe, s.str())); + } + + vector write_probes = i->second->write_probes; + + for (auto j = write_probes.begin(); j != write_probes.end(); j++) + { + stringstream s; + s << "procfsprobe/" << (*j)->umask 
<< "/w/" << (*j)->maxsize_val << "/" << i->first; + v.push_back(std::pair (*j, s.str())); + } + } + + return true; +} struct procfs_var_expanding_visitor: public var_expanding_visitor { @@ -121,7 +155,7 @@ procfs_derived_probe::join_group (systemtap_session& s) ec->code = string("#include \"procfs-probes.h\""); s.embeds.push_back(ec); } - s.procfs_derived_probes->enroll (this); + s.procfs_derived_probes->enroll (this, s); this->group = s.procfs_derived_probes; } @@ -136,7 +170,7 @@ procfs_derived_probe::use_internal_buffer(const std::string& var) void -procfs_derived_probe_group::enroll (procfs_derived_probe* p) +procfs_derived_probe_group::enroll (procfs_derived_probe* p, systemtap_session& s) { procfs_probe_set *pset; @@ -149,6 +183,12 @@ procfs_derived_probe_group::enroll (procfs_derived_probe* p) { pset = probes_by_path[p->path]; + // You can't have read and write probes for the same path in the bpf runtime. + if (s.runtime_mode == systemtap_session::bpf_runtime && + ((p->write && pset->read_probe) || (! p->write && pset->write_probes.size() > 0))) + throw SEMANTIC_ERROR(_("both read and write procfs probes cannot exist for the same procfs path \"") + + p->path + "\" in the bpf runtime."); + // You can only specify 1 read probe. if (! p->write && pset->read_probe != NULL) throw SEMANTIC_ERROR(_("only one read procfs probe can exist for procfs path \"") + p->path + "\""); @@ -491,14 +531,40 @@ procfs_var_expanding_visitor::visit_target_symbol (target_symbol* e) bool lvalue = is_active_lvalue(e); if (write_probe && lvalue) throw SEMANTIC_ERROR(_("procfs $value variable is read-only in a procfs write probe"), e->tok); - else if (! write_probe && ! lvalue) - throw SEMANTIC_ERROR(_("procfs $value variable cannot be read in a procfs read probe"), e->tok); + else if (! write_probe && ! 
lvalue) + throw SEMANTIC_ERROR(_("procfs $value variable cannot be read in a procfs read probe"), e->tok); if (e->addressof) throw SEMANTIC_ERROR(_("cannot take address of procfs variable"), e->tok); // Remember that we've seen a target variable. target_symbol_seen = true; + + // If we're in the bpf runtime, we simply replace the target variable with helper + // functions in the tapset library which will act as an interfacing mechanism. + if (sess.runtime_mode == systemtap_session::bpf_runtime) + { + functioncall* n = new functioncall; + n->tok = e->tok; + + if (!lvalue) + n->function = "_get_procfs_value"; + else + { + if (*op == "=") + n->function = "_set_procfs_value"; + else if (*op == ".=") + n->function = "_append_procfs_value"; + else + throw SEMANTIC_ERROR (_("Only the following assign operators are" + " implemented on procfs read target variables:" + " '=', '.='"), e->tok); + provide_lvalue_call (n); + } + + provide (n); + return; + } // Synthesize a function. functiondecl *fdecl = new functiondecl; @@ -674,6 +740,8 @@ procfs_builder::build(systemtap_session & sess, throw SEMANTIC_ERROR (_("procfs path cannot be relative (and contain '.' or '..')"), location->components.front()->tok); } + + if (!(has_read ^ has_write)) throw SEMANTIC_ERROR (_("need read/write component"), location->components.front()->tok); diff --git a/tapset/bpf/target.stp b/tapset/bpf/target.stp new file mode 100644 index 000000000..ec2dc15aa --- /dev/null +++ b/tapset/bpf/target.stp @@ -0,0 +1,51 @@ +// target tapset +// Copyright (C) 2019 Red Hat Inc. +// +// This file is part of systemtap, and is free software. You can +// redistribute it and/or modify it under the terms of the GNU General +// Public License (GPL); either version 2, or (at your option) any +// later version. + +// TODO: get 'call' instruction to handle the functions below + +/** + * function _set_procfs_value - Used to set the message + * from a procfs-like probe. + * + * Description: This function always returns 0. 
+ */
+
+function _set_procfs_value:long (msg:string)
+%{/* bpf */
+  0xbf, 1, $msg, -, -; /* mov r1, $msg */
+  0x85, 0, 0, 0, -6; /* call BPF_FUNC_set_procfs_value */
+  0xbf, $$, 0, -, -; /* return r0 */
+%}
+
+/**
+ * function _append_procfs_value - Used to append to
+ * the message for a procfs-like probe.
+ *
+ * Description: This function always returns 0.
+ */
+
+function _append_procfs_value:long (append:string)
+%{/* bpf */
+  0xbf, 1, $append, -, -; /* mov r1, $append */
+  0x85, 0, 0, 0, -7; /* call BPF_FUNC_append_procfs_value */
+  0xbf, $$, 0, -, -; /* return r0 */
+%}
+
+/**
+ * function _get_procfs_value - Used to get the message
+ * for a procfs-like probe.
+ *
+ * Description: This function returns a pointer to the
+ * message.
+ */
+
+function _get_procfs_value:string ()
+%{/* bpf */
+  0x85, 0, 0, 0, -8; /* call BPF_FUNC_get_procfs_value */
+  0xbf, $$, 0, -, -; /* return r0 */
+%}
diff --git a/tapsets.h b/tapsets.h
index b501c3249..c52900ad4 100644
--- a/tapsets.h
+++ b/tapsets.h
@@ -49,6 +49,9 @@ typedef std::vector >
 bool sort_for_bpf(systemtap_session& s,
                   generic_kprobe_derived_probe_group *ge,
                   sort_for_bpf_probe_arg_vector &v);
+bool sort_for_bpf(systemtap_session& s,
+                  procfs_derived_probe_group *pr,
+                  sort_for_bpf_probe_arg_vector &v);
 bool sort_for_bpf(systemtap_session& s,
                   hrtimer_derived_probe_group *hr,
                   timer_derived_probe_group *t,
diff --git a/testsuite/systemtap.base/procfs_bpf.exp b/testsuite/systemtap.base/procfs_bpf.exp
new file mode 100644
index 000000000..7a9c4e9f0
--- /dev/null
+++ b/testsuite/systemtap.base/procfs_bpf.exp
@@ -0,0 +1,118 @@
+# Test cases for procfs probes with bpf runtime
+
+set test "PROCFS_BPF"
+
+if {![installtest_p]} { untested $test; return }
+# proc_read_value: open PATH read-only and return its contents with the trailing newline dropped; reports pass, or reports fail and returns "" when the open fails.
+proc proc_read_value {test path} {
+    set value ""
+    if [catch {open $path RDONLY} channel] {
+        fail "$test $channel"
+    } else {
+        set value [read -nonewline $channel]
+        close $channel
+        pass "$test read $value"
+    }
+    return $value
+}
+# proc_write_value: open PATH write-only and puts VALUE to it; reports pass, or reports fail when the open fails.
+proc proc_write_value {test path
value} {
+    if [catch {open $path WRONLY} channel] {
+        fail "$test $channel"
+    } else {
+        puts $channel $value
+        close $channel
+        pass "$test wrote $value"
+    }
+}
+# proc_read_write: alternately reads the "command" path and writes the "other" path, checking that each read returns the last value written (initially 100); always returns 0.
+proc proc_read_write {} {
+    global test
+    # NOTE(review): 'user' is neither declared global nor set in this proc, so $user below looks unresolved at run time -- confirm where the user name should come from.
+    set path_read "/var/tmp/systemtap-$user/$test.bo/command"
+    set path_write "/var/tmp/systemtap-$user/$test.bo/other"
+
+    # read the initial value, which should be '100'
+    set value [proc_read_value $test $path_read]
+    if { $value == "100" } {
+        pass "$test received correct initial value"
+    } else {
+        fail "$test received incorrect initial value: $value"
+    }
+
+    # write a new value of '200'
+    proc_write_value $test $path_write "200"
+
+    # make sure it got set to '200'
+    set value [proc_read_value $test $path_read]
+    if { $value == "200" } {
+        pass "$test received correct value: 200"
+    } else {
+        fail "$test received incorrect value: $value"
+    }
+
+    # read it again to make sure nothing changed
+    set value [proc_read_value "$test again" $path_read]
+    if { $value == "200" } {
+        pass "$test received correct value: 200 again"
+    } else {
+        fail "$test received incorrect value: $value again"
+    }
+
+    # write a new value of 'hello'
+    proc_write_value $test $path_write "hello"
+
+    # make sure it got set to 'hello'
+    set value [proc_read_value $test $path_read]
+    if { $value == "hello" } {
+        pass "$test received correct value: hello"
+    } else {
+        fail "$test received incorrect value: $value"
+    }
+
+    # write a new value of 'goodbye'
+    proc_write_value $test $path_write "goodbye"
+
+    # make sure it got set to 'goodbye'
+    set value [proc_read_value $test $path_read]
+    if { $value == "goodbye" } {
+        pass "$test received correct value: goodbye"
+    } else {
+        fail "$test received incorrect value: $value"
+    }
+
+    return 0;
+}
+
+# The script starts with a value of "100". If the user writes into the
+# "other" file under /var/tmp/systemtap-USER/MODNAME, that value is
+# returned by the next read of the "command" file.
+
+set systemtap_script {
+    global saved_value
+
+    probe procfs.read {
+        $value = saved_value
+    }
+
+    probe procfs("other").write {
+        saved_value = $value
+    }
+
+    probe begin {
+        saved_value = "100\n"
+        printf("systemtap starting probe\n")
+    }
+
+    probe end {
+        printf("systemtap ending probe\n")
+        printf("final value = %s", saved_value)
+    }
+}
+
+
+# test procfs probes: stap_run is given proc_read_write and the expected final-output pattern, and runs the script with --bpf as module $test
+set output_string "\\mfinal value = goodbye\\M\r\n"
+stap_run $test proc_read_write $output_string --bpf -e $systemtap_script -m $test
+# remove ${test}.bo from the current directory
+exec /bin/rm -f ${test}.bo
-- 
2.43.5