1 /* stapbpf.cxx - SystemTap BPF loader
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 *
16 * Copyright (C) 2016-2019 Red Hat, Inc.
17 *
18 */
19
20 #include <cstdio>
21 #include <cstdlib>
22 #include <cstring>
23 #include <cstdarg>
24 #include <cassert>
25 #include <csignal>
26 #include <cerrno>
27 #include <fstream>
28 #include <sstream>
29 #include <string>
30 #include <thread>
31 #include <vector>
32 #include <unistd.h>
33 #include <limits.h>
34 #include <inttypes.h>
35 #include <getopt.h>
36 #include <sys/fcntl.h>
37 #include <sys/ioctl.h>
38 #include <sys/syscall.h>
39 #include <sys/mman.h>
40 #include <sys/utsname.h>
41 #include <sys/resource.h>
#include <poll.h> /* poll()/struct pollfd, used by perf_event_loop() */
#include <pthread.h> /* pthread_self()/pthread_kill() */
#include <set> /* std::set, used by perf_event_loop() */
42 #include "bpfinterp.h"
43
44 extern "C" {
45 #include <linux/bpf.h>
46 #include <linux/perf_event.h>
47 /* Introduced in 4.1. */
48 #ifndef PERF_EVENT_IOC_SET_BPF
49 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
50 #endif
51 #include <libelf.h>
52 }
53
54 #include "config.h"
55 #include "../git_version.h"
56 #include "../version.h"
57 #include "../bpf-internal.h"
58
59 #ifndef EM_BPF
60 #define EM_BPF 0xeb9f
61 #endif
62 #ifndef R_BPF_MAP_FD
63 #define R_BPF_MAP_FD 1
64 #endif
65
66 using namespace std;
67
68 static int group_fd = -1; // ??? Need one per cpu.
69 extern "C" {
70 int log_level = 0;
71 };
72 int target_pid = 0;
73 static int warnings = 1;
74 static int exit_phase = 0;
75 static int interrupt_message = 0;
76 static FILE *output_f = stdout;
77 static FILE *kmsg = NULL;
78
79 static const char *module_name;
80 static const char *module_basename;
81 static const char *script_name; // name of original systemtap script
82 static const char *module_license;
83 static Elf *module_elf;
84
85 static uint32_t kernel_version;
86
87 // Sized by the contents of the "maps" section.
88 static bpf_map_def *map_attrs;
89 static std::vector<int> map_fds;
90
91 // PR24543: Some perf constructs must be anchored to a single CPU.
92 // Normally we use cpu0, but it could (in very rare cases) be disabled.
93 // Initialized in mark_active_cpus() along with cpu_online.
94 static int default_cpu = 0;
95
96 // Sized by the number of CPUs:
97 static std::vector<int> perf_fds;
98 static std::vector<bool> cpu_online; // -- is CPU active?
99 static std::vector<struct perf_event_mmap_page *> perf_headers;
100 static std::vector<bpf_transport_context *> transport_contexts;
101
102 // Additional info for perf_events transport:
103 static int perf_event_page_size;
104 static int perf_event_page_count = 8;
105 static int perf_event_mmap_size;
106
107 // Table of interned strings:
108 static std::vector<std::string> interned_strings;
109
110 // Table of map id's for statistical aggregates:
111 static std::unordered_map<bpf::globals::agg_idx, bpf::globals::stats_map> aggregates;
112
113 // XXX: Required static data and methods from bpf::globals, shared with translator.
114 #include "../bpf-shared-globals.h"
115
116 // Sized by the number of sections, so that we can easily
117 // look them up by st_shndx.
118 static std::vector<int> prog_fds;
119
120 // Programs to run at begin and end of execution.
121 static Elf_Data *prog_begin;
122 static Elf_Data *prog_end;
123
124 #define DEBUGFS "/sys/kernel/debug/tracing/"
125 #define KPROBE_EVENTS DEBUGFS "kprobe_events"
126 #define UPROBE_EVENTS DEBUGFS "uprobe_events"
127 #define EVENTS DEBUGFS "events"
128
129 #define CPUFS "/sys/devices/system/cpu/"
130 #define CPUS_ONLINE CPUFS "online"
131 #define CPUS_POSSIBLE CPUFS "possible"
132
133 static void unregister_kprobes(const size_t nprobes);
134
135 struct kprobe_data
136 {
137 string args;
138 char type;
139 int prog_fd;
140 int event_id;
141 int event_fd; // ??? Need one per cpu.
142
143 kprobe_data(char t, string s, int fd)
144 : args(s), type(t), prog_fd(fd), event_id(-1), event_fd(-1)
145 { }
146 };
147
148 struct uprobe_data
149 {
150 string path;
151 char type;
152 int pid;
153 unsigned long long offset;
154 int prog_fd;
155 int event_id;
156 int event_fd;
157
158 uprobe_data(string path, char t, int pid, unsigned long long off, int fd)
159 : path(path), type(t), pid(pid), offset(off), prog_fd(fd),
160 event_id(-1), event_fd(-1)
161 { }
162 };
163
164 struct timer_data
165 {
166 unsigned long period;
167 int prog_fd;
168 int event_fd;
169
170 timer_data(unsigned long period, int fd)
171 : period(period), prog_fd(fd), event_fd(-1)
172 { }
173 };
174
175 struct perf_data
176 {
177 int event_type;
178 int event_config;
179 bool has_freq;
180 unsigned long interval;
181 int prog_fd;
182 int event_fd;
183
184 perf_data(int type, int config, bool freq, unsigned long interval, int fd)
185 : event_type(type), event_config(config), has_freq(freq),
186 interval(interval), prog_fd(fd), event_fd(-1)
187 { }
188 };
189
190 struct trace_data
191 {
192 string system;
193 string name;
194 int prog_fd;
195 int event_id;
196 int event_fd;
197
198 trace_data(char *s, char *n, int fd)
199 : system(s), name(n), prog_fd(fd), event_id(-1), event_fd(-1)
200 { }
201 };
202
203 static std::vector<kprobe_data> kprobes;
204 static std::vector<timer_data> timers;
205 static std::vector<perf_data> perf_probes;
206 static std::vector<trace_data> tracepoint_probes;
207 static std::vector<uprobe_data> uprobes;
208
209 // TODO: Move fatal() to bpfinterp.h and replace abort() calls in the interpreter.
210 // TODO: Add warn() option.
211 static void __attribute__((noreturn))
212 fatal(const char *str, ...)
213 {
214 if (module_name)
215 fprintf(stderr, "Error loading %s: ", module_name);
216
217 va_list va;
218 va_start(va, str);
219 vfprintf(stderr, str, va);
220 va_end(va);
221
222 exit(1);
223 }
224
225 static void
226 fatal_sys()
227 {
228 fatal("%s\n", strerror(errno));
229 }
230
231 static void
232 fatal_elf()
233 {
234 fatal("%s\n", elf_errmsg(-1));
235 }
236
237
238 // XXX: based on get_online_cpus()/read_cpu_range()
239 // in bcc src/cc/common.cc
240 //
241 // PR24543: Also sets default_cpu.
242 //
243 // This is the only way I know of so far, so I have to imitate it for
244 // now. Parsing a /sys/devices diagnostic file seems a bit brittle to
245 // me, though.
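// The file lists comma-separated CPU ranges, e.g. "0-3,5,7-11".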
246 static void
247 mark_active_cpus(unsigned ncpus)
248 {
249 std::ifstream cpu_ranges(CPUS_ONLINE);
250 std::string cpu_range;
251
252 // XXX if cpu0 is offline
253 int alternate_cpu = -1;
254 bool found_alternate = false;
255
256 cpu_online.clear();
257 for (unsigned i = 0; i < ncpus; i++)
258 cpu_online.push_back(false);
259
260 while (std::getline(cpu_ranges, cpu_range, ','))
261 {
262 size_t rangepos = cpu_range.find("-");
263 int start, end;
264 if (rangepos == std::string::npos)
265 {
266 start = end = std::stoi(cpu_range);
267 }
268 else
269 {
270 start = std::stoi(cpu_range.substr(0, rangepos));
271 end = std::stoi(cpu_range.substr(rangepos+1));
272 }
273 for (int i = start; i <= end; i++)
274 {
275 if (!found_alternate)
276 {
277 alternate_cpu = i;
278 found_alternate = true;
279 }
280 cpu_online[i] = true;
281 }
282 }
283
284 // PR24543: Make sure default_cpu is active.
285 if (!cpu_online[default_cpu] && found_alternate)
286 default_cpu = alternate_cpu;
287 }
288
289 static int
290 count_active_cpus()
291 {
292 int count = 0;
293 for (unsigned cpu = 0; cpu < cpu_online.size(); cpu++)
294 if (cpu_online[cpu])
295 count++;
296 return count;
297 }
298
299 static int
300 create_group_fds()
301 {
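// Open a disabled PERF_COUNT_SW_DUMMY event on the default CPU to act as
// the group leader; the kprobe, timer and tracepoint events opened later
// pass this fd as group_fd, so main() can enable/disable them together
// with a single ioctl on group_fd.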
302 perf_event_attr peattr;
303
304 memset(&peattr, 0, sizeof(peattr));
305 peattr.size = sizeof(peattr);
306 peattr.disabled = 1;
307 peattr.type = PERF_TYPE_SOFTWARE;
308 peattr.config = PERF_COUNT_SW_DUMMY;
309
310 return group_fd = perf_event_open(&peattr, -1, default_cpu, -1, 0);
311 }
312
313 static void
314 instantiate_maps (Elf64_Shdr *shdr, Elf_Data *data)
315 {
316 if (shdr->sh_entsize != sizeof(bpf_map_def))
317 fatal("map entry size mismatch (%zu != %zu)\n",
318 (size_t)shdr->sh_entsize, sizeof(bpf_map_def));
319
320 size_t i, n = shdr->sh_size / sizeof(bpf_map_def);
321 struct bpf_map_def *attrs = static_cast<bpf_map_def *>(data->d_buf);
322
323 map_attrs = attrs;
324 map_fds.assign(n, -1);
325
326 // XXX: PR24324 -- This overhead space calculation was too
327 // conservative and caused resource exhaustion errors, disabling it
328 // until we figure out how much space we need or if the
329 // RLIM_INFINITY solution below is adequate.
330 #if 0
331 /* First, make room for the maps in this process' RLIMIT_MEMLOCK: */
332 size_t rlimit_increase = 0;
333 for (i = 0; i < n; ++i)
334 {
335 // TODO: The 58 bytes of overhead space per entry has been
336 // decided by trial and error, and may require further tweaking:
337 rlimit_increase += (58 + attrs[i].key_size + attrs[i].value_size) * attrs[i].max_entries;
338 // TODO: Note that Certain Other Tools just give up on
339 // calculating and set rlimit to the maximum possible.
340 }
341 #endif
342
343 struct rlimit curr_rlimit;
344 int rc;
345
346 rc = getrlimit(RLIMIT_MEMLOCK, &curr_rlimit);
347 if (rc < 0)
348 fatal("could not get map resource limit: %s\n",
349 strerror(errno));
350
351 rlim_t rlim_orig = curr_rlimit.rlim_cur;
352 rlim_t rlim_max_orig = curr_rlimit.rlim_max;
353 #if 0
354 curr_rlimit.rlim_cur += rlimit_increase;
355 curr_rlimit.rlim_max += rlimit_increase;
356 if (curr_rlimit.rlim_cur < rlim_orig) // handle overflow
357 curr_rlimit.rlim_cur = rlim_orig;
358 if (curr_rlimit.rlim_max < rlim_max_orig) // handle overflow
359 curr_rlimit.rlim_max = rlim_max_orig;
360 #endif
361 // TODOXXX: PR24324 -- EXPERIMENTAL fix for aggressive resource limits.
362 // Other Tools do something like this but it doesn't solve all our problems.
363 curr_rlimit.rlim_cur = RLIM_INFINITY;
364 curr_rlimit.rlim_max = RLIM_INFINITY;
365
366 rc = setrlimit(RLIMIT_MEMLOCK, &curr_rlimit);
367 if (rc < 0)
368 fatal("could not increase map resource limit -- "
369 "cur from %lu to %lu, max from %lu to %lu: %s\n",
370 rlim_orig, curr_rlimit.rlim_cur,
371 rlim_max_orig, curr_rlimit.rlim_max,
372 strerror(errno));
373 if (log_level > 1)
374 {
375 fprintf(stderr, "increasing map cur resource limit from %lu to %lu\n",
376 rlim_orig, curr_rlimit.rlim_cur);
377 fprintf(stderr, "increasing map max resource limit from %lu to %lu\n",
378 rlim_max_orig, curr_rlimit.rlim_max);
379 }
380
381 /* Now create the maps: */
382 for (i = 0; i < n; ++i)
383 {
384 /* PR22330: The perf_event_map used for message transport must
385 have max_entries equal to the number of active CPUs, which we
386 wouldn't know for sure at translate time. Set it now: */
387 bpf_map_type map_type = static_cast<bpf_map_type>(attrs[i].type);
388 if (map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY)
389 {
390 /* XXX: Assume our only perf_event_map is the percpu transport one: */
391 assert(i == bpf::globals::perf_event_map_idx);
392 assert(attrs[i].max_entries == bpf::globals::NUM_CPUS_PLACEHOLDER);
393
394 // TODO: perf_event buffers can only be created for currently
395 // active CPUs. For now we imitate Certain Other Tools and
396 // create perf_events for CPUs that are active at startup time
397 // (while sizing the perf_event_map according to total CPUs).
398 // But for full coverage, we really need to listen to CPUs
399 // coming on/offline and adjust accordingly.
400 long ncpus_ = sysconf(_SC_NPROCESSORS_CONF);
401 unsigned ncpus = ncpus_ > 0 ? ncpus_ : 1;
402 if (ncpus_ < 0)
403 fprintf(stderr, "WARNING: could not get number of CPUs, falling back to 1: %s\n", strerror(errno));
404 else if (ncpus_ == 0)
405 fprintf(stderr, "WARNING: could not get number of CPUs, falling back to 1\n"); // XXX no errno
406 //unsigned ncpus = get_nprocs_conf();
407 mark_active_cpus((unsigned)ncpus);
408 attrs[i].max_entries = ncpus;
409 }
410
411 if (log_level > 2)
412 fprintf(stderr, "creating map type %u entry %zu: key_size %u, value_size %u, "
413 "max_entries %u, map_flags %u\n", map_type, i,
414 attrs[i].key_size, attrs[i].value_size,
415 attrs[i].max_entries, attrs[i].map_flags);
416 int fd = bpf_create_map(static_cast<bpf_map_type>(attrs[i].type),
417 attrs[i].key_size, attrs[i].value_size,
418 attrs[i].max_entries, attrs[i].map_flags);
419 if (fd < 0)
420 fatal("map entry %zu: %s\n", i, strerror(errno));
421 map_fds[i] = fd;
422 }
423 }
424
425 static int
426 prog_load(Elf_Data *data, const char *name)
427 {
428 enum bpf_prog_type prog_type;
429
430 if (strncmp(name, "kprobe", 6) == 0)
431 prog_type = BPF_PROG_TYPE_KPROBE;
432 else if (strncmp(name, "kretprobe", 9) == 0)
433 prog_type = BPF_PROG_TYPE_KPROBE;
434 else if (strncmp(name, "uprobe", 6) == 0)
435 prog_type = BPF_PROG_TYPE_KPROBE;
436 else if (strncmp(name, "timer", 5) == 0)
437 prog_type = BPF_PROG_TYPE_PERF_EVENT;
438 else if (strncmp(name, "trace", 5) == 0)
439 prog_type = BPF_PROG_TYPE_TRACEPOINT;
440 else if (strncmp(name, "perf", 4) == 0)
441 {
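// An event type of 2 in "perf/<type>/..." is PERF_TYPE_TRACEPOINT, which
// needs a tracepoint program rather than a perf_event program.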
442 if (name[5] == '2' && name[6] == '/')
443 prog_type = BPF_PROG_TYPE_TRACEPOINT;
444 else
445 prog_type = BPF_PROG_TYPE_PERF_EVENT;
446 }
447 else
448 fatal("unhandled program type for section \"%s\"\n", name);
449
450 if (data->d_size % sizeof(bpf_insn))
451 fatal("program size not a multiple of %zu\n", sizeof(bpf_insn));
452
453 if (kmsg != NULL)
454 {
455 fprintf (kmsg, "%s (%s): stapbpf: %s, name: %s, d_size: %lu\n",
456 module_basename, script_name, VERSION, name, (unsigned long)data->d_size);
457 fflush (kmsg); // Otherwise, flush will only happen after the prog runs.
458 }
459 int fd = bpf_prog_load(prog_type, static_cast<bpf_insn *>(data->d_buf),
460 data->d_size, module_license, kernel_version);
461 if (fd < 0)
462 {
463 if (bpf_log_buf[0] != 0)
464 fatal("bpf program load failed: %s\n%s\n",
465 strerror(errno), bpf_log_buf);
466 else
467 fatal("bpf program load failed: %s\n", strerror(errno));
468 }
469 return fd;
470 }
471
472 static void
473 prog_relocate(Elf_Data *prog_data, Elf_Data *rel_data,
474 Elf_Data *sym_data, Elf_Data *str_data,
475 const char *prog_name, unsigned maps_idx, bool allocated)
476 {
477 bpf_insn *insns = static_cast<bpf_insn *>(prog_data->d_buf);
478 Elf64_Rel *rels = static_cast<Elf64_Rel *>(rel_data->d_buf);
479 Elf64_Sym *syms = static_cast<Elf64_Sym *>(sym_data->d_buf);
480
481 if (prog_data->d_size % sizeof(bpf_insn))
482 fatal("program size not a multiple of %zu\n", sizeof(bpf_insn));
483 if (rel_data->d_type != ELF_T_REL
484 || rel_data->d_size % sizeof(Elf64_Rel))
485 fatal("invalid reloc metadata\n");
486 if (sym_data->d_type != ELF_T_SYM
487 || sym_data->d_size % sizeof(Elf64_Sym))
488 fatal("invalid symbol metadata\n");
489
490 size_t psize = prog_data->d_size;
491 size_t nrels = rel_data->d_size / sizeof(Elf64_Rel);
492 size_t nsyms = sym_data->d_size / sizeof(Elf64_Sym);
493
494 for (size_t i = 0; i < nrels; ++i)
495 {
496 uint32_t sym = ELF64_R_SYM(rels[i].r_info);
497 uint32_t type = ELF64_R_TYPE(rels[i].r_info);
498 unsigned long long r_ofs = rels[i].r_offset;
499 size_t fd_idx;
500
501 if (type != R_BPF_MAP_FD)
502 fatal("invalid relocation type %u\n", type);
503 if (sym >= nsyms)
504 fatal("invalid symbol index %u\n", sym);
505 if (r_ofs >= psize || r_ofs % sizeof(bpf_insn))
506 fatal("invalid relocation offset at %s+%llu\n", prog_name, r_ofs);
507
510 if (syms[sym].st_shndx != maps_idx
511 || syms[sym].st_value % sizeof(bpf_map_def)
512 || (fd_idx = syms[sym].st_value / sizeof(bpf_map_def),
513 fd_idx >= map_fds.size()))
514 {
515 const char *name = "";
516 if (syms[sym].st_name < str_data->d_size)
517 name = static_cast<char *>(str_data->d_buf) + syms[sym].st_name;
518 if (*name)
519 fatal("symbol %s does not reference a map\n", name);
520 else
521 fatal("symbol %u does not reference a map\n", sym);
522 }
523
524 bpf_insn *insn = insns + (r_ofs / sizeof(bpf_insn));
525 if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
526 fatal("invalid relocation insn at %s+%llu\n", prog_name, r_ofs);
527
528 insn->src_reg = BPF_PSEUDO_MAP_FD;
529 insn->imm = (allocated ? map_fds[fd_idx] : fd_idx);
530 }
531 }
532
533 static void
534 maybe_collect_kprobe(const char *name, unsigned name_idx,
535 unsigned fd_idx, Elf64_Addr offset)
536 {
537 char type;
538 string arg;
539
540 if (strncmp(name, "kprobe/", 7) == 0)
541 {
542 string line;
543 const char *stext = NULL;
544 type = 'p';
545 name += 7;
546
547 ifstream syms("/proc/kallsyms");
548 if (!syms)
549 fatal("error opening /proc/kallsyms: %s\n", strerror(errno));
550
551 // get value of symbol _stext and add it to the offset found in name.
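// NB: each kallsyms line looks like "ffffffff81000000 T _stext"; the
// l + 19 below assumes a 16-digit address, a space, a one-character type
// and another space before the symbol name (i.e. a 64-bit kernel).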
552 while (getline(syms, line))
553 {
554 const char *l = line.c_str();
555 if (strncmp(l + 19, "_stext", 6) == 0)
556 {
557 stext = l;
558 break;
559 }
560 }
561
562 if (stext == NULL)
563 fatal("could not find _stext in /proc/kallsyms");
564
565 unsigned long addr = strtoul(stext, NULL, 16);
566 addr += strtoul(name, NULL, 16);
567 stringstream ss;
568 ss << "0x" << hex << addr;
569 arg = ss.str();
570 }
571 else if (strncmp(name, "kretprobe/", 10) == 0)
572 type = 'r', arg = name + 10;
573 else
574 return;
575
576 int fd = -1;
577 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
578 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
579 if (offset != 0)
580 fatal("probe %u offset non-zero\n", name_idx);
581
582 kprobes.push_back(kprobe_data(type, arg, fd));
583 }
584
585 static void
586 collect_uprobe(const char *name, unsigned name_idx, unsigned fd_idx)
587 {
588 char type = '\0';
589 int pid = -1;
590 unsigned long long off = 0;
591 char path[PATH_MAX];
592
593 int res = sscanf(name, "uprobe/%c/%d/%llu%s", &type, &pid, &off, path);
594
595 if (!pid)
596 pid = -1; // indicates to perf_event_open that we're tracing all processes
597
598 if (res != 4)
599 fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);
600
601 int fd = -1;
602 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
603 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
604
605 uprobes.push_back(uprobe_data(std::string(path), type, pid, off, fd));
606 }
607
608 static void
609 collect_perf(const char *name, unsigned name_idx, unsigned fd_idx)
610 {
611 char has_freq;
612 int event_type;
613 int event_config;
614 unsigned long interval;
615
616 int res = sscanf(name, "perf/%d/%d/%c/%lu",
617 &event_type, &event_config, &has_freq, &interval);
618 if (res != 4)
619 fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);
620
621 int fd = -1;
622 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
623 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
624
625 perf_probes.push_back(
626 perf_data(event_type, event_config, has_freq == 'f', interval, fd));
627 }
628
629 static void
630 collect_timer(const char *name, unsigned name_idx, unsigned fd_idx)
631 {
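// The period starts at name + 11, i.e. after an 11-character prefix such
// as "timer/jiff/"; jiffy periods are converted to nanoseconds below.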
632 unsigned long period = strtoul(name + 11, NULL, 10);
633
634 if (strncmp(name + 6, "jiff/", 5) == 0)
635 {
636 long jiffies_per_sec = sysconf(_SC_CLK_TCK);
637 period *= 1e9 / jiffies_per_sec;
638 }
639
640 int fd = -1;
641 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
642 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
643
644 timers.push_back(timer_data(period, fd));
645 return;
646 }
647
648 static void
649 collect_tracepoint(const char *name, unsigned name_idx, unsigned fd_idx)
650 {
651 char tp_system[512];
652 char tp_name[512];
653
654 int res = sscanf(name, "trace/%[^/]/%s", tp_system, tp_name);
655 if (res != 2 || strlen(name) > 512)
656 fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);
657
658 int fd = -1;
659 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
660 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
661
662 tracepoint_probes.push_back(trace_data(tp_system, tp_name, fd));
663 }
664
665 static void
666 kprobe_collect_from_syms(Elf_Data *sym_data, Elf_Data *str_data)
667 {
668 Elf64_Sym *syms = static_cast<Elf64_Sym *>(sym_data->d_buf);
669 size_t nsyms = sym_data->d_size / sizeof(Elf64_Sym);
670
671 if (sym_data->d_type != ELF_T_SYM
672 || sym_data->d_size % sizeof(Elf64_Sym))
673 fatal("invalid kprobes symbol metadata\n");
674
675 for (size_t i = 0; i < nsyms; ++i)
676 {
677 const char *name;
678 if (syms[i].st_name < str_data->d_size)
679 name = static_cast<char *>(str_data->d_buf) + syms[i].st_name;
680 else
681 fatal("symbol %u has invalid string index\n", i);
682 maybe_collect_kprobe(name, i, syms[i].st_shndx, syms[i].st_value);
683 }
684 }
685
686 static void
687 unregister_uprobes(const size_t nprobes)
688 {
689 if (nprobes == 0)
690 return;
691
692 int fd = open(UPROBE_EVENTS, O_WRONLY);
693 if (fd < 0)
694 return;
695
696
697 const int pid = getpid();
698 for (size_t i = 0; i < nprobes; ++i)
699 {
700 close(uprobes[i].event_fd);
701
702 char msgbuf[128];
703 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "-:stapprobe_%d_%zu",
704 pid, i);
705 ssize_t wlen = write(fd, msgbuf, olen);
706 if (wlen < 0)
707 fprintf(stderr, "Error removing probe %zu: %s\n",
708 i, strerror(errno));
709 }
710 close(fd);
711 }
712
713 static void
714 register_uprobes()
715 {
716 size_t nprobes = uprobes.size();
717 if (nprobes == 0)
718 return;
719
720 int fd = open(UPROBE_EVENTS, O_WRONLY);
721 if (fd < 0)
722 fatal("Error opening %s: %s\n", UPROBE_EVENTS, strerror(errno));
723
724 const int pid = getpid();
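// Each probe is written to uprobe_events as
// "p:stapprobe_<pid>_<idx> <path>:0x<offset>" (or "r:" for return probes);
// the kernel then exposes it under events/uprobes/stapprobe_<pid>_<idx>/.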
725
726 for (size_t i = 0; i < nprobes; ++i)
727 {
728 uprobe_data &u = uprobes[i];
729 char msgbuf[PATH_MAX];
730
731 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "%c:stapprobe_%d_%zu %s:0x%llx",
732 u.type, pid, i, u.path.c_str(), u.offset);
733 if ((size_t)olen >= sizeof(msgbuf))
734 {
735 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
736 if (i == 0)
737 goto fail_0;
738 nprobes = i; // probes [0, i) were created and must be removed
739 goto fail_n;
740 }
741
742 if (log_level > 1)
743 fprintf(stderr, "Associating probe %zu with uprobe %s\n", i, msgbuf);
744
745 ssize_t wlen = write(fd, msgbuf, olen);
746 if (wlen != olen)
747 {
748 fprintf(stderr, "Error creating probe %zu: %s\n",
749 i, strerror(errno));
750 if (i == 0)
751 goto fail_0;
752 nprobes = i; // probes [0, i) were created and must be removed
753 goto fail_n;
754 }
755 }
756 close(fd);
757
758 for (size_t i = 0; i < nprobes; ++i)
759 {
760 char fnbuf[PATH_MAX];
761 ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
762 DEBUGFS "events/uprobes/stapprobe_%d_%zu/id", pid, i);
763 if ((size_t)len >= sizeof(fnbuf))
764 {
765 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
766 goto fail_n;
767 }
768
769 fd = open(fnbuf, O_RDONLY);
770 if (fd < 0)
771 {
772 fprintf(stderr, "Error opening probe event id %zu: %s\n",
773 i, strerror(errno));
774 goto fail_n;
775 }
776
777 char msgbuf[128];
778 len = read(fd, msgbuf, sizeof(msgbuf) - 1);
779 if (len < 0)
780 {
781 fprintf(stderr, "Error reading probe event id %zu: %s\n",
782 i, strerror(errno));
783 goto fail_n;
784 }
785 close(fd);
786
787 msgbuf[len] = 0;
788 uprobes[i].event_id = atoi(msgbuf);
789 }
790
791 // ??? Iterate to enable on all cpus, each with a different group_fd.
792 {
793 perf_event_attr peattr;
794
795 memset(&peattr, 0, sizeof(peattr));
796 peattr.size = sizeof(peattr);
797 peattr.type = PERF_TYPE_TRACEPOINT;
798 peattr.sample_type = PERF_SAMPLE_RAW;
799 peattr.sample_period = 1;
800 peattr.wakeup_events = 1;
801
802 for (size_t i = 0; i < nprobes; ++i)
803 {
804 uprobe_data &u = uprobes[i];
805 peattr.config = u.event_id;
806
807 fd = perf_event_open(&peattr, u.pid, default_cpu, -1, 0);
808 if (fd < 0)
809 {
810 fprintf(stderr, "Error opening probe id %zu: %s\n",
811 i, strerror(errno));
812 goto fail_n;
813 }
814 u.event_fd = fd;
815
816 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, u.prog_fd) < 0)
817 {
818 fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
819 i, strerror(errno));
820 goto fail_n;
821 }
822 }
823 }
824 return;
825
826 fail_n:
827 unregister_uprobes(nprobes);
828 fail_0:
829 exit(1);
830 }
831
832 static void
833 register_kprobes()
834 {
835 size_t nprobes = kprobes.size();
836 if (nprobes == 0)
837 return;
838
839 int fd = open(KPROBE_EVENTS, O_WRONLY);
840 if (fd < 0)
841 fatal("Error opening %s: %s\n", KPROBE_EVENTS, strerror(errno));
842
843 const int pid = getpid();
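// Each probe is written to kprobe_events as "p:p<pid>_<idx> <address>"
// (or "r:" for kretprobes); the kernel then exposes it under
// events/kprobes/p<pid>_<idx>/.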
844
845 for (size_t i = 0; i < nprobes; ++i)
846 {
847 kprobe_data &k = kprobes[i];
848 char msgbuf[128];
849
850 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "%c:p%d_%zu %s",
851 k.type, pid, i, k.args.c_str());
852 if ((size_t)olen >= sizeof(msgbuf))
853 {
854 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
855 if (i == 0)
856 goto fail_0;
857 nprobes = i; // probes [0, i) were created and must be removed
858 goto fail_n;
859 }
860
861 if (log_level > 1)
862 fprintf(stderr, "Associating probe %zu with kprobe %s\n", i, msgbuf);
863
864 ssize_t wlen = write(fd, msgbuf, olen);
865 if (wlen != olen)
866 {
867 fprintf(stderr, "Error creating probe %zu: %s\n",
868 i, strerror(errno));
869 if (i == 0)
870 goto fail_0;
871 nprobes = i; // probes [0, i) were created and must be removed
872 goto fail_n;
873 }
874 }
875 close(fd);
876
877 for (size_t i = 0; i < nprobes; ++i)
878 {
879 char fnbuf[PATH_MAX];
880 ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
881 DEBUGFS "events/kprobes/p%d_%zu/id", pid, i);
882 if ((size_t)len >= sizeof(fnbuf))
883 {
884 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
885 goto fail_n;
886 }
887
888 fd = open(fnbuf, O_RDONLY);
889 if (fd < 0)
890 {
891 fprintf(stderr, "Error opening probe event id %zu: %s\n",
892 i, strerror(errno));
893 goto fail_n;
894 }
895
896 char msgbuf[128];
897 len = read(fd, msgbuf, sizeof(msgbuf) - 1);
898 if (len < 0)
899 {
900 fprintf(stderr, "Error reading probe event id %zu: %s\n",
901 i, strerror(errno));
902 goto fail_n;
903 }
904 close(fd);
905
906 msgbuf[len] = 0;
907 kprobes[i].event_id = atoi(msgbuf);
908 }
909
910 // ??? Iterate to enable on all cpus, each with a different group_fd.
911 {
912 perf_event_attr peattr;
913
914 memset(&peattr, 0, sizeof(peattr));
915 peattr.size = sizeof(peattr);
916 peattr.type = PERF_TYPE_TRACEPOINT;
917 peattr.sample_type = PERF_SAMPLE_RAW;
918 peattr.sample_period = 1;
919 peattr.wakeup_events = 1;
920
921 for (size_t i = 0; i < nprobes; ++i)
922 {
923 kprobe_data &k = kprobes[i];
924 peattr.config = k.event_id;
925
926 fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
927 if (fd < 0)
928 {
929 fprintf(stderr, "Error opening probe id %zu: %s\n",
930 i, strerror(errno));
931 goto fail_n;
932 }
933 k.event_fd = fd;
934
935 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, k.prog_fd) < 0)
936 {
937 fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
938 i, strerror(errno));
939 goto fail_n;
940 }
941 }
942 }
943 return;
944
945 fail_n:
946 unregister_kprobes(nprobes);
947 fail_0:
948 exit(1);
949 }
950
951 static void
952 unregister_kprobes(const size_t nprobes)
953 {
954 if (nprobes == 0)
955 return;
956
957 int fd = open(KPROBE_EVENTS, O_WRONLY);
958 if (fd < 0)
959 return;
960
961
962 const int pid = getpid();
963 for (size_t i = 0; i < nprobes; ++i)
964 {
965 close(kprobes[i].event_fd);
966
967 char msgbuf[128];
968 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "-:p%d_%zu",
969 pid, i);
970 ssize_t wlen = write(fd, msgbuf, olen);
971 if (wlen < 0)
972 fprintf(stderr, "Error removing probe %zu: %s\n",
973 i, strerror(errno));
974 }
975 close(fd);
976 }
977
978 static void
979 unregister_tracepoints(const size_t nprobes)
980 {
981 for (size_t i = 0; i < nprobes; ++i)
982 close(tracepoint_probes[i].event_fd);
983 }
984
985 static void
986 register_tracepoints()
987 {
988 size_t nprobes = tracepoint_probes.size();
989 if (nprobes == 0)
990 return;
991
992 for (size_t i = 0; i < nprobes; ++i)
993 {
994 trace_data &t = tracepoint_probes[i];
995 char fnbuf[PATH_MAX];
996 ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
997 DEBUGFS "events/%s/%s/id",
998 t.system.c_str(), t.name.c_str());
999 if ((size_t)len >= sizeof(fnbuf))
1000 {
1001 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
1002 goto fail;
1003 }
1004
1005 int fd = open(fnbuf, O_RDONLY);
1006 if (fd < 0)
1007 {
1008 fprintf(stderr, "Error opening probe event id %zu: %s\n",
1009 i, strerror(errno));
1010
1011 if (errno == ENOENT)
1012 fprintf(stderr, "\"%s/%s\" could not be found in %s\n",
1013 t.system.c_str(), t.name.c_str(), EVENTS);
1014
1015 goto fail;
1016 }
1017
1018 char msgbuf[128];
1019 len = read(fd, msgbuf, sizeof(msgbuf) - 1);
1020 if (len < 0)
1021 {
1022 fprintf(stderr, "Error reading probe event id %zu: %s\n",
1023 i, strerror(errno));
1024 close(fd);
1025 goto fail;
1026 }
1027 close(fd);
1028
1029 msgbuf[len] = 0;
1030 t.event_id = atoi(msgbuf);
1031 }
1032
1033 // ??? Iterate to enable on all cpus, each with a different group_fd.
1034 {
1035 perf_event_attr peattr;
1036
1037 memset(&peattr, 0, sizeof(peattr));
1038 peattr.size = sizeof(peattr);
1039 peattr.type = PERF_TYPE_TRACEPOINT;
1040 peattr.sample_type = PERF_SAMPLE_RAW;
1041 peattr.sample_period = 1;
1042 peattr.wakeup_events = 1;
1043
1044 for (size_t i = 0; i < nprobes; ++i)
1045 {
1046 trace_data &t = tracepoint_probes[i];
1047 peattr.config = t.event_id;
1048
1049 int fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
1050 if (fd < 0)
1051 {
1052 fprintf(stderr, "Error opening probe id %zu: %s\n",
1053 i, strerror(errno));
1054 goto fail;
1055 }
1056 t.event_fd = fd;
1057
1058 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, t.prog_fd) < 0)
1059 {
1060 fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
1061 i, strerror(errno));
1062 goto fail;
1063 }
1064 }
1065 }
1066 return;
1067
1068 fail:
1069 unregister_tracepoints(nprobes);
1070 exit(1);
1071 }
1072
1073 static void
1074 unregister_timers(const size_t nprobes)
1075 {
1076 for (size_t i = 0; i < nprobes; ++i)
1077 close(timers[i].event_fd);
1078 }
1079
1080 static void
1081 register_timers()
1082 {
1083 perf_event_attr peattr;
1084
1085 memset(&peattr, 0, sizeof(peattr));
1086 peattr.size = sizeof(peattr);
1087 peattr.type = PERF_TYPE_SOFTWARE;
1088 peattr.config = PERF_COUNT_SW_CPU_CLOCK;
1089
1090 for (size_t i = 0; i < timers.size(); ++i)
1091 {
1092 timer_data &t = timers[i];
1093 peattr.sample_period = t.period;
1094
1095 int fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
1096 if (fd < 0)
1097 {
1098 int err = errno;
1099 unregister_timers(timers.size());
1100 fatal("Error opening timer probe id %zu: %s\n", i + 1, strerror(err));
1101 }
1102
1103 t.event_fd = fd;
1104 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, t.prog_fd) < 0)
1105 {
1106 int err = errno;
1107 unregister_timers(timers.size());
1108 fatal("Error installing bpf for timer probe id %zu: %s\n",
1109 i + 1, strerror(err));
1110 }
1111 }
1112
1113 return;
1114 }
1115
1116 static void
1117 unregister_perf(const size_t nprobes)
1118 {
1119 for (size_t i = 0; i < nprobes; ++i)
1120 close(perf_probes[i].event_fd);
1121 }
1122
1123 static void
1124 register_perf()
1125 {
1126 for (size_t i = 0; i < perf_probes.size(); ++i)
1127 {
1128 perf_data &p = perf_probes[i];
1129 perf_event_attr peattr;
1130
1131 memset(&peattr, 0, sizeof(peattr));
1132 peattr.size = sizeof(peattr);
1133 peattr.type = p.event_type;
1134 peattr.config = p.event_config;
1135
1136 if (p.has_freq)
1137 {
1138 peattr.freq = 1;
1139 peattr.sample_freq = p.interval;
1140 }
1141 else
1142 peattr.sample_period = p.interval;
1143
1144 // group_fd is not used since this event might have an
1145 // incompatible type/config.
1146 int fd = perf_event_open(&peattr, -1, default_cpu, -1, 0);
1147 if (fd < 0)
1148 {
1149 int err = errno;
1150 unregister_perf(perf_probes.size());
1151 fatal("Error opening perf probe id %zu: %s\n", i + 1, strerror(err));
1152 }
1153
1154 p.event_fd = fd;
1155 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, p.prog_fd) < 0)
1156 {
1157 int err = errno;
1158 unregister_perf(perf_probes.size());
1159 fatal("Error installing bpf for perf probe id %zu: %s\n",
1160 i + 1, strerror(err));
1161 }
1162 }
1163 }
1164
1165 static void
1166 init_internal_globals()
1167 {
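// Clear the shared EXIT flag so the probes start in the running state.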
1168 using namespace bpf;
1169
1170 int key = globals::EXIT;
1171 long val = 0;
1172
1173 if (bpf_update_elem(map_fds[globals::internal_map_idx],
1174 (void*)&key, (void*)&val, BPF_ANY) != 0)
1175 fatal("Error updating pid: %s\n", strerror(errno));
1176
1177 }
1178
1179 // PR22330: Initialize perf_event_map and perf_fds.
1180 static void
1181 init_perf_transport()
1182 {
1183 using namespace bpf;
1184
1185 unsigned ncpus = map_attrs[globals::perf_event_map_idx].max_entries;
1186
1187 for (unsigned cpu = 0; cpu < ncpus; cpu++)
1188 {
1189 if (!cpu_online[cpu]) // -- skip inactive CPUs.
1190 {
1191 perf_fds.push_back(-1);
1192 transport_contexts.push_back(nullptr);
1193 continue;
1194 }
1195
1196 struct perf_event_attr peattr;
1197
1198 memset(&peattr, 0, sizeof(peattr));
1199 peattr.size = sizeof(peattr);
1200 peattr.sample_type = PERF_SAMPLE_RAW;
1201 peattr.type = PERF_TYPE_SOFTWARE;
1202 peattr.config = PERF_COUNT_SW_BPF_OUTPUT;
1203 peattr.sample_period = 1;
1204 peattr.wakeup_events = 1;
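// PERF_COUNT_SW_BPF_OUTPUT is the software event that receives records
// emitted by the BPF programs via bpf_perf_event_output(); one is opened
// per online CPU and stored in the perf_event_map.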
1205
1206 int pmu_fd = perf_event_open(&peattr, -1/*pid*/, cpu, -1/*group_fd*/, 0);
1207 if (pmu_fd < 0)
1208 fatal("Error initializing perf event for cpu %d: %s\n", cpu, strerror(errno));
1209 if (bpf_update_elem(map_fds[globals::perf_event_map_idx],
1210 (void*)&cpu, (void*)&pmu_fd, BPF_ANY) != 0)
1211 fatal("Error assigning perf event for cpu %d: %s\n", cpu, strerror(errno));
1212 ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
1213 perf_fds.push_back(pmu_fd);
1214
1215 // Create a data structure to track what's happening on each CPU:
1216 bpf_transport_context *ctx
1217 = new bpf_transport_context(cpu, pmu_fd, ncpus, map_attrs, &map_fds,
1218 output_f, &interned_strings, &aggregates);
1219 transport_contexts.push_back(ctx);
1220 }
1221
1222 // XXX: based on perf_event_mmap_header()
1223 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1224 perf_event_page_size = getpagesize();
1225 perf_event_mmap_size = perf_event_page_size * (perf_event_page_count + 1);
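// Each ring buffer is one metadata page followed by perf_event_page_count
// data pages, hence the "+ 1" above.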
1226 for (unsigned cpu = 0; cpu < ncpus; cpu++)
1227 {
1228 if (!cpu_online[cpu]) // -- skip inactive CPUs.
1229 {
1230 perf_headers.push_back(nullptr);
1231 continue;
1232 }
1233
1234 int pmu_fd = perf_fds[cpu];
1235 void *base = mmap(NULL, perf_event_mmap_size,
1236 PROT_READ | PROT_WRITE, MAP_SHARED,
1237 pmu_fd, 0);
1238 if (base == MAP_FAILED)
1239 fatal("error mmapping header for perf_event fd %d\n", pmu_fd);
1240 perf_headers.push_back((perf_event_mmap_page*)base);
1241 if (log_level > 2)
1242 fprintf(stderr, "Initialized perf_event output on cpu %d\n", cpu);
1243 }
1244 }
1245
1246 static void
1247 load_bpf_file(const char *module)
1248 {
1249 module_name = module;
1250
1251 /* Extract basename: */
1252 char *buf = (char *)malloc(BPF_MAXSTRINGLEN * sizeof(char));
1253 string module_name_str(module);
1254 string module_basename_str
1255 = module_name_str.substr(module_name_str.rfind('/')+1); // basename
1256 size_t len = module_basename_str.copy(buf, BPF_MAXSTRINGLEN-1);
1257 buf[len] = '\0';
1258 module_basename = buf;
1259
1260 int fd = open(module, O_RDONLY);
1261 if (fd < 0)
1262 fatal_sys();
1263
1264 elf_version(EV_CURRENT);
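// libelf requires the expected ELF version to be declared before elf_begin().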
1265
1266 Elf *elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL);
1267 if (elf == NULL)
1268 fatal_elf();
1269 module_elf = elf;
1270
1271 Elf64_Ehdr *ehdr = elf64_getehdr(elf);
1272 if (ehdr == NULL)
1273 fatal_elf();
1274
1275 // Byte order should match the host, since we're loading locally.
1276 {
1277 const char *end_str;
1278 switch (ehdr->e_ident[EI_DATA])
1279 {
1280 case ELFDATA2MSB:
1281 if (__BYTE_ORDER == __BIG_ENDIAN)
1282 break;
1283 end_str = "MSB";
1284 goto err_endian;
1285 case ELFDATA2LSB:
1286 if (__BYTE_ORDER == __LITTLE_ENDIAN)
1287 break;
1288 end_str = "LSB";
1289 goto err_endian;
1290 case ELFDATANONE:
1291 end_str = "none";
1292 goto err_endian;
1293 default:
1294 end_str = "unknown";
1295 err_endian:
1296 fatal("incorrect byte ordering: %s\n", end_str);
1297 }
1298 }
1299
1300 // Tiny bit of sanity checking on the rest of the header. Since LLVM
1301 // began by producing files with EM_NONE, accept that too.
1302 if (ehdr->e_machine != EM_NONE && ehdr->e_machine != EM_BPF)
1303 fatal("incorrect machine type: %d\n", ehdr->e_machine);
1304
1305 unsigned shnum = ehdr->e_shnum;
1306 prog_fds.assign(shnum, -1);
1307
1308 std::vector<Elf64_Shdr *> shdrs(shnum, NULL);
1309 std::vector<Elf_Data *> sh_data(shnum, NULL);
1310 std::vector<const char *> sh_name(shnum, NULL);
1311 unsigned maps_idx = 0;
1312 unsigned version_idx = 0;
1313 unsigned license_idx = 0;
1314 unsigned script_name_idx = 0;
1315 unsigned interned_strings_idx = 0;
1316 unsigned aggregates_idx = 0;
1317 unsigned kprobes_idx = 0;
1318 unsigned begin_idx = 0;
1319 unsigned end_idx = 0;
1320
1321 // First pass to identify special sections, and make sure
1322 // all data is readable.
1323 for (unsigned i = 1; i < shnum; ++i)
1324 {
1325 Elf_Scn *scn = elf_getscn(elf, i);
1326 if (!scn)
1327 fatal_elf();
1328
1329 Elf64_Shdr *shdr = elf64_getshdr(scn);
1330 if (!shdr)
1331 fatal_elf();
1332
1333 const char *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
1334 if (!shname)
1335 fatal_elf();
1336
1337 // We need not consider any empty sections.
1338 if (shdr->sh_size == 0 || !*shname)
1339 continue;
1340
1341 Elf_Data *data = elf_getdata(scn, NULL);
1342 if (data == NULL)
1343 fatal_elf();
1344
1345 shdrs[i] = shdr;
1346 sh_name[i] = shname;
1347 sh_data[i] = data;
1348
1349 if (strcmp(shname, "license") == 0)
1350 license_idx = i;
1351 else if (strcmp(shname, "stapbpf_script_name") == 0)
1352 script_name_idx = i;
1353 else if (strcmp(shname, "stapbpf_interned_strings") == 0)
1354 interned_strings_idx = i;
1355 else if (strcmp(shname, "stapbpf_aggregates") == 0)
1356 aggregates_idx = i;
1357 else if (strcmp(shname, "version") == 0)
1358 version_idx = i;
1359 else if (strcmp(shname, "maps") == 0)
1360 maps_idx = i;
1361 else if (strcmp(shname, "kprobes") == 0)
1362 kprobes_idx = i;
1363 else if (strcmp(shname, "stap_begin") == 0)
1364 begin_idx = i;
1365 else if (strcmp(shname, "stap_end") == 0)
1366 end_idx = i;
1367 }
1368
1369 // Two special sections are not optional.
1370 if (license_idx != 0)
1371 module_license = static_cast<char *>(sh_data[license_idx]->d_buf);
1372 else
1373 fatal("missing license section\n");
1374 if (script_name_idx != 0)
1375 script_name = static_cast<char *>(sh_data[script_name_idx]->d_buf);
1376 else
1377 script_name = "<unknown>";
1378 if (version_idx != 0)
1379 {
1380 unsigned long long size = shdrs[version_idx]->sh_size;
1381 if (size != 4)
1382 fatal("invalid version size (%llu)\n", size);
1383 memcpy(&kernel_version, sh_data[version_idx]->d_buf, 4);
1384 }
1385 else
1386 fatal("missing version section\n");
1387
1388 // Create bpf maps as required.
1389 if (maps_idx != 0)
1390 instantiate_maps(shdrs[maps_idx], sh_data[maps_idx]);
1391
1392 // Create interned strings as required.
1393 if (interned_strings_idx != 0)
1394 {
1395 // XXX: Whatever the type used by the translator, this section
1396 // just holds a blob of NUL-terminated strings we parse as follows:
1397 char *strtab = static_cast<char *>(sh_data[interned_strings_idx]->d_buf);
1398 unsigned long long strtab_size = shdrs[interned_strings_idx]->sh_size;
1399 unsigned ofs = 0;
1400 bool found_hdr = false;
1401 while (ofs < strtab_size)
1402 {
1403 // XXX: Potentially vulnerable to NUL byte in string constant.
1404 std::string str(strtab+ofs); // XXX: will slurp up to NUL byte
1405 if (str.size() == 0 && !found_hdr)
1406 found_hdr = true; // section *may* start with an extra NUL byte
1407 else
1408 interned_strings.push_back(str);
1409 ofs += str.size() + 1;
1410 }
1411 }
1412
1413 // PR23476: Initialize table of statistical aggregates.
1414 if (aggregates_idx != 0)
1415 {
1416 uint64_t *aggtab = static_cast<uint64_t *>(sh_data[aggregates_idx]->d_buf);
1417 unsigned long long aggtab_size = shdrs[aggregates_idx]->sh_size;
1418 unsigned ofs = 0; unsigned i = 0;
1419 while (ofs < aggtab_size)
1420 {
1421 bpf::globals::agg_idx agg_id = (bpf::globals::agg_idx)aggtab[i];
1422 bpf::globals::interned_stats_map ism;
1423 for (unsigned j = 0; j < bpf::globals::stat_fields.size(); j++)
1424 {
1425 ism.push_back(aggtab[i+1+j]);
1426 }
1427 aggregates[agg_id] = bpf::globals::deintern_stats_map(ism);
1428 i += 1 + bpf::globals::stat_fields.size();
1429 ofs = sizeof(uint64_t) * i;
1430 }
1431 }
1432
1433 // Relocate all programs that require it.
1434 for (unsigned i = 1; i < shnum; ++i)
1435 {
1436 Elf64_Shdr *rel_hdr = shdrs[i];
1437 if (rel_hdr == NULL || rel_hdr->sh_type != SHT_REL)
1438 continue;
1439
1440 unsigned progi = rel_hdr->sh_info;
1441 if (progi == 0 || progi >= shnum)
1442 fatal("invalid section info %u->%u\n", i, progi);
1443 Elf64_Shdr *prog_hdr = shdrs[progi];
1444
1445 unsigned symi = rel_hdr->sh_link;
1446 if (symi == 0 || symi >= shnum)
1447 fatal("invalid section link %u->%u\n", i, symi);
1448 Elf64_Shdr *sym_hdr = shdrs[symi];
1449
1450 unsigned stri = sym_hdr->sh_link;
1451 if (stri == 0 || stri >= shnum)
1452 fatal("invalid section link %u->%u\n", symi, stri);
1453
1454 if (prog_hdr->sh_flags & SHF_EXECINSTR)
1455 prog_relocate(sh_data[progi], sh_data[i], sh_data[symi],
1456 sh_data[stri], sh_name[progi], maps_idx,
1457 prog_hdr->sh_flags & SHF_ALLOC);
1458 }
1459
1460 // Load all programs that require it.
1461 for (unsigned i = 1; i < shnum; ++i)
1462 {
1463 Elf64_Shdr *shdr = shdrs[i];
1464 if ((shdr->sh_flags & SHF_ALLOC) && (shdr->sh_flags & SHF_EXECINSTR))
1465 prog_fds[i] = prog_load(sh_data[i], sh_name[i]);
1466 }
1467
1468 // Remember begin and end probes.
1469 if (begin_idx)
1470 {
1471 Elf64_Shdr *shdr = shdrs[begin_idx];
1472 if (shdr->sh_flags & SHF_EXECINSTR)
1473 prog_begin = sh_data[begin_idx];
1474 }
1475 if (end_idx)
1476 {
1477 Elf64_Shdr *shdr = shdrs[end_idx];
1478 if (shdr->sh_flags & SHF_EXECINSTR)
1479 prog_end = sh_data[end_idx];
1480 }
1481
1482 // Record all kprobes.
1483 if (kprobes_idx != 0)
1484 {
1485 // The Preferred Systemtap Way puts kprobe strings into a symbol
1486 // table, so that multiple kprobes can reference the same program.
1487
1488 // ??? We don't really have to have a separate kprobe symbol table;
1489 // we could pull kprobes out of the main symbol table too. This
1490 // would probably make it easier for llvm-bpf folks to transition.
1491 // One would only need to create symbol aliases with custom asm names.
1492
1493 Elf64_Shdr *sym_hdr = shdrs[kprobes_idx];
1494 if (sym_hdr->sh_type != SHT_SYMTAB)
1495 fatal("invalid section type for kprobes section\n");
1496
1497 unsigned stri = sym_hdr->sh_link;
1498 if (stri == 0 || stri >= shnum)
1499 fatal("invalid section link %u->%u\n", kprobes_idx, stri);
1500
1501 kprobe_collect_from_syms(sh_data[kprobes_idx], sh_data[stri]);
1502 }
1503 else
1504 {
1505 // The original llvm-bpf way puts kprobe strings into the
1506 // section name. Each kprobe has its own program.
1507 for (unsigned i = 1; i < shnum; ++i)
1508 maybe_collect_kprobe(sh_name[i], i, i, 0);
1509 }
1510
1511 // Record all other probes
1512 for (unsigned i = 1; i < shnum; ++i) {
1513 if (strncmp(sh_name[i], "uprobe", 6) == 0)
1514 collect_uprobe(sh_name[i], i, i);
1515 if (strncmp(sh_name[i], "trace", 5) == 0)
1516 collect_tracepoint(sh_name[i], i, i);
1517 if (strncmp(sh_name[i], "perf", 4) == 0)
1518 collect_perf(sh_name[i], i, i);
1519 if (strncmp(sh_name[i], "timer", 5) == 0)
1520 collect_timer(sh_name[i], i, i);
1521 }
1522 }
1523
1524 static int
1525 get_exit_status()
1526 {
1527 int key = bpf::globals::EXIT;
1528 long val = 0;
1529
1530 if (bpf_lookup_elem
1531 (map_fds[bpf::globals::internal_map_idx], &key, &val) != 0)
1532 fatal("error during bpf map lookup: %s\n", strerror(errno));
1533
1534 return val;
1535 }
1536
1537 // XXX: based on perf_event_sample
1538 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1539 struct perf_event_sample {
1540 struct perf_event_header header;
1541 __u32 size;
1542 char data[];
1543 };
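// For PERF_RECORD_SAMPLE records written by bpf_perf_event_output(), 'size'
// counts itself plus the raw payload in data[] (as the handler below assumes).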
1544
1545 static enum bpf_perf_event_ret
1546 perf_event_handle(struct perf_event_header *hdr, void *private_data)
1547 {
1548 // XXX: based on bpf_perf_event_print
1549 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1550
1551 struct perf_event_sample *e = (struct perf_event_sample *)hdr;
1552 bpf_transport_context *ctx = (bpf_transport_context *)private_data;
1553 bpf_perf_event_ret ret;
1554
1555 // Make sure we weren't passed a userspace context by accident.
1556 assert(ctx->pmu_fd >= 0);
1557
1558 if (e->header.type == PERF_RECORD_SAMPLE)
1559 {
1560 __u32 actual_size = e->size - sizeof(e->size);
1561 ret = bpf_handle_transport_msg(e->data, actual_size, ctx);
1562 if (ret != LIBBPF_PERF_EVENT_CONT)
1563 return ret;
1564 }
1565 else if (e->header.type == PERF_RECORD_LOST)
1566 {
1567 struct lost_events {
1568 struct perf_event_header header;
1569 __u64 id;
1570 __u64 lost;
1571 };
1572 struct lost_events *lost = (lost_events *) e;
1573 fprintf(stderr, "WARNING: lost %lld perf_events on cpu %d\n",
1574 (long long)lost->lost, ctx->cpu);
1575 }
1576 else
1577 {
1578 fprintf(stderr, "WARNING: unknown perf_event type=%d size=%d on cpu %d\n",
1579 e->header.type, e->header.size, ctx->cpu);
1580 }
1581 return LIBBPF_PERF_EVENT_CONT;
1582 }
1583
1584 // PR22330: Listen for perf_events.
1585 static void
1586 perf_event_loop(pthread_t main_thread)
1587 {
1588 // XXX: based on perf_event_poller_multi()
1589 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1590
1591 enum bpf_perf_event_ret ret;
1592 void *data = NULL;
1593 size_t len = 0;
1594
1595 unsigned ncpus
1596 = map_attrs[bpf::globals::perf_event_map_idx].max_entries;
1597 unsigned n_active_cpus
1598 = count_active_cpus();
1599 struct pollfd *pmu_fds
1600 = (struct pollfd *)malloc(n_active_cpus * sizeof(struct pollfd));
1601 vector<unsigned> cpuids;
1602
1603 assert(ncpus == perf_fds.size());
1604 unsigned i = 0;
1605 for (unsigned cpu = 0; cpu < ncpus; cpu++)
1606 {
1607 if (!cpu_online[cpu]) continue; // -- skip inactive CPUs.
1608
1609 pmu_fds[i].fd = perf_fds[cpu];
1610 pmu_fds[i].events = POLLIN;
1611 cpuids.push_back(cpu);
1612 i++;
1613 }
1614 assert(n_active_cpus == cpuids.size());
1615
1616 // Avoid multiple warnings about errors reading from an fd:
1617 std::set<int> already_warned;
1618
1619 for (;;)
1620 {
1621 if (log_level > 3)
1622 fprintf(stderr, "Polling for perf_event data on %d cpus...\n", n_active_cpus);
1623 int ready = poll(pmu_fds, n_active_cpus, 1000); // XXX: Consider setting timeout -1 (unlimited).
1624 if (ready < 0 && errno == EINTR)
1625 goto signal_exit;
1626 if (ready < 0)
1627 fatal("Error checking for perf events: %s\n", strerror(errno));
1628 for (unsigned i = 0; i < n_active_cpus; i++)
1629 {
1630 if (pmu_fds[i].revents <= 0)
1631 continue;
1632 if (log_level > 3)
1633 fprintf(stderr, "Saw perf_event on fd %d\n", pmu_fds[i].fd);
1634
1635 ready --;
1636 unsigned cpu = cpuids[i];
1637 ret = bpf_perf_event_read_simple
1638 (perf_headers[cpu],
1639 perf_event_page_count * perf_event_page_size,
1640 perf_event_page_size,
1641 &data, &len,
1642 perf_event_handle, transport_contexts[cpu]);
1643
1644 if (ret == LIBBPF_PERF_EVENT_DONE)
1645 {
1646 // Saw STP_EXIT message. If the exit flag is set,
1647 // wake up main thread to begin program shutdown.
1648 if (get_exit_status())
1649 goto signal_exit;
1650 continue;
1651 }
1652 if (ret != LIBBPF_PERF_EVENT_CONT)
1653 if (already_warned.count(pmu_fds[i].fd) == 0)
1654 {
1655 fprintf(stderr, "WARNING: could not read from perf_event buffer on fd %d\n", pmu_fds[i].fd);
1656 already_warned.insert(pmu_fds[i].fd);
1657 }
1658 }
1659 assert(ready == 0);
1660 }
1661
1662 signal_exit:
1663 pthread_kill(main_thread, SIGINT);
1664 free(pmu_fds);
1665 return;
1666 }
1667
1668 static void
1669 usage(const char *argv0)
1670 {
1671 printf("Usage: %s [-v][-w][-V][-h] [-o FILE] <bpf-file>\n"
1672 " -h, --help Show this help text\n"
1673 " -v, --verbose Increase verbosity\n"
1674 " -V, --version Show version\n"
1675 " -w Suppress warnings\n"
1676 " -x pid Sets the '_stp_target' variable to pid.\n"
1677 " -o FILE Send output to FILE\n",
1678 argv0);
1679 }
1680
1681
1682 void
1683 sigint(int s)
1684 {
1685 // suppress any subsequent SIGINTs that may come from stap parent process
1686 signal(s, SIG_IGN);
1687
1688 // during the exit phase, ^C should exit immediately
1689 if (exit_phase)
1690 {
1691 if (!interrupt_message) // avoid duplicate message
1692 fprintf(stderr, "received interrupt during exit probe\n");
1693 interrupt_message = 1;
1694 abort();
1695 }
1696
1697 // set exit flag
1698 int key = bpf::globals::EXIT;
1699 long val = 1;
1700
1701 if (bpf_update_elem
1702 (map_fds[bpf::globals::internal_map_idx], &key, &val, 0) != 0)
1703 fatal("error during bpf map update: %s\n", strerror(errno));
1704 }
1705
1706 int
1707 main(int argc, char **argv)
1708 {
1709 static const option long_opts[] = {
1710 { "help", 0, NULL, 'h' },
1711 { "verbose", 0, NULL, 'v' },
1712 { "version", 0, NULL, 'V' },
1713 };
1714
1715 int rc;
1716
1717 while ((rc = getopt_long(argc, argv, "hvVwx:o:", long_opts, NULL)) >= 0)
1718 switch (rc)
1719 {
1720 case 'v':
1721 log_level++;
1722 break;
1723 case 'w':
1724 warnings = 0;
1725 break;
1726
1727 case 'x':
1728 target_pid = atoi(optarg);
1729 break;
1730
1731 case 'o':
1732 output_f = fopen(optarg, "w");
1733 if (output_f == NULL)
1734 {
1735 fprintf(stderr, "Error opening %s for output: %s\n",
1736 optarg, strerror(errno));
1737 return 1;
1738 }
1739 break;
1740
1741 case 'V':
1742 printf("Systemtap BPF loader/runner (version %s, %s)\n"
1743 "Copyright (C) 2016-2019 Red Hat, Inc. and others\n" // PRERELEASE
1744 "This is free software; "
1745 "see the source for copying conditions.\n",
1746 VERSION, STAP_EXTENDED_VERSION);
1747 return 0;
1748
1749 case 'h':
1750 usage(argv[0]);
1751 return 0;
1752
1753 default:
1754 do_usage:
1755 usage(argv[0]);
1756 return 1;
1757 }
1758 if (optind != argc - 1)
1759 goto do_usage;
1760
1761 // Be sure dmesg mentions that we are loading bpf programs:
1762 kmsg = fopen("/dev/kmsg", "w");
1763 if (kmsg == NULL)
1764 fprintf(stderr, "WARNING: could not open /dev/kmsg for diagnostics: %s\n", strerror(errno));
1765
1766 load_bpf_file(argv[optind]); // <- XXX initializes cpus online, PR24543 initializes default_cpu
1767 init_internal_globals();
1768 init_perf_transport();
1769
1770 // Create a bpf_transport_context for userspace programs:
1771 unsigned ncpus = map_attrs[bpf::globals::perf_event_map_idx].max_entries;
1772 bpf_transport_context uctx(default_cpu, -1/*pmu_fd*/, ncpus,
1773 map_attrs, &map_fds, output_f,
1774 &interned_strings, &aggregates);
1775
1776 if (create_group_fds() < 0)
1777 fatal("Error creating perf event group: %s\n", strerror(errno));
1778
1779 register_kprobes();
1780 register_uprobes();
1781 register_timers();
1782 register_tracepoints();
1783 register_perf();
1784
1785 // Run the begin probes.
1786 if (prog_begin)
1787 bpf_interpret(prog_begin->d_size / sizeof(bpf_insn),
1788 static_cast<bpf_insn *>(prog_begin->d_buf),
1789 &uctx);
1790
1791 // Wait for ^C; read BPF_OUTPUT events, copying them to output_f.
1792 signal(SIGINT, (sighandler_t)sigint);
1793 signal(SIGTERM, (sighandler_t)sigint);
1794
1795 // PR22330: Listen for perf_events:
1796 std::thread(perf_event_loop, pthread_self()).detach();
1797
1798 // Now that the begin probe has run and the perf_event listener is active, enable the kprobes.
1799 ioctl(group_fd, PERF_EVENT_IOC_ENABLE, 0);
1800
1801 // Wait for STP_EXIT message:
1802 while (!get_exit_status())
1803 pause();
1804
1805 // Disable the kprobes before deregistering and running exit probes.
1806 ioctl(group_fd, PERF_EVENT_IOC_DISABLE, 0);
1807 close(group_fd);
1808
1809 // Unregister all probes.
1810 unregister_kprobes(kprobes.size());
1811 unregister_uprobes(uprobes.size());
1812 unregister_timers(timers.size());
1813 unregister_perf(perf_probes.size());
1814 unregister_tracepoints(tracepoint_probes.size());
1815
1816 // We are now running exit probes, so ^C should exit immediately:
1817 exit_phase = 1;
1818 signal(SIGINT, (sighandler_t)sigint); // restore previously ignored signal
1819 signal(SIGTERM, (sighandler_t)sigint);
1820
1821 // Run the end+error probes.
1822 if (prog_end)
1823 bpf_interpret(prog_end->d_size / sizeof(bpf_insn),
1824 static_cast<bpf_insn *>(prog_end->d_buf),
1825 &uctx);
1826
1827 // Clean up transport layer allocations:
1828 for (std::vector<bpf_transport_context *>::iterator it = transport_contexts.begin();
1829 it != transport_contexts.end(); it++)
1830 delete *it;
1831
1832 elf_end(module_elf);
1833 fclose(kmsg);
1834 return 0;
1835 }