1 /* stapbpf.cxx - SystemTap BPF loader
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program. If not, see <http://www.gnu.org/licenses/>.
15 *
16 * Copyright (C) 2016-2019 Red Hat, Inc.
17 *
18 */
19
20 #include <cstdio>
21 #include <cstdlib>
22 #include <cstring>
23 #include <cstdarg>
24 #include <cassert>
25 #include <csignal>
26 #include <cerrno>
27 #include <fstream>
28 #include <sstream>
29 #include <string>
30 #include <thread>
31 #include <vector>
32 #include <unistd.h>
33 #include <limits.h>
34 #include <inttypes.h>
35 #include <getopt.h>
36 #include <sys/fcntl.h>
37 #include <sys/ioctl.h>
38 #include <sys/syscall.h>
39 #include <sys/mman.h>
40 #include <sys/utsname.h>
41 #include <sys/resource.h>
#include <poll.h> /* poll()/struct pollfd, used by perf_event_loop() */
#include <pthread.h> /* pthread_self()/pthread_kill() */
#include <set> /* std::set, used by perf_event_loop() */
42 #include "bpfinterp.h"
43
44 extern "C" {
45 #include <linux/bpf.h>
46 #include <linux/perf_event.h>
47 /* Introduced in 4.1. */
48 #ifndef PERF_EVENT_IOC_SET_BPF
49 #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
50 #endif
51 #include <libelf.h>
52 }
53
54 #include "config.h"
55 #include "../git_version.h"
56 #include "../version.h"
57 #include "../bpf-internal.h"
58
59 #ifndef EM_BPF
60 #define EM_BPF 0xeb9f
61 #endif
62 #ifndef R_BPF_MAP_FD
63 #define R_BPF_MAP_FD 1
64 #endif
65
66 using namespace std;
67
68 static int group_fd = -1; // ??? Need one per cpu.
69 extern "C" {
70 int log_level = 0;
71 };
72 int target_pid = 0;
73 static int warnings = 1;
74 static int exit_phase = 0;
75 static int interrupt_message = 0;
76 static FILE *output_f = stdout;
77 static FILE *kmsg = NULL;
78
79 static const char *module_name;
80 static const char *module_basename;
81 static const char *script_name; // name of original systemtap script
82 static const char *module_license;
83 static Elf *module_elf;
84
85 static uint32_t kernel_version;
86
87 // Sized by the contents of the "maps" section.
88 static bpf_map_def *map_attrs;
89 static std::vector<int> map_fds;
90
91 // PR24543: Some perf constructs must be anchored to a single CPU.
92 // Normally we use cpu0, but it could (in very rare cases) be disabled.
93 // Initialized in mark_active_cpus() along with cpu_online.
94 static int default_cpu = 0;
95
96 // Sized by the number of CPUs:
97 static std::vector<int> perf_fds;
98 static std::vector<bool> cpu_online; // -- is CPU active?
99 static std::vector<struct perf_event_mmap_page *> perf_headers;
100 static std::vector<bpf_transport_context *> transport_contexts;
101
102 // Additional info for perf_events transport:
103 static int perf_event_page_size;
104 static int perf_event_page_count = 8;
105 static int perf_event_mmap_size;
106
107 // Table of interned strings:
108 static std::vector<std::string> interned_strings;
109
110 // Table of map id's for statistical aggregates:
111 static std::unordered_map<bpf::globals::agg_idx, bpf::globals::stats_map> aggregates;
112
113 // XXX: Required static data and methods from bpf::globals, shared with translator.
114 #include "../bpf-shared-globals.h"
115
116 // Sized by the number of sections, so that we can easily
117 // look them up by st_shndx.
118 static std::vector<int> prog_fds;
119
120 // Programs to run at begin and end of execution.
121 static Elf_Data *prog_begin;
122 static Elf_Data *prog_end;
123
124 #define DEBUGFS "/sys/kernel/debug/tracing/"
125 #define KPROBE_EVENTS DEBUGFS "kprobe_events"
126 #define UPROBE_EVENTS DEBUGFS "uprobe_events"
127 #define EVENTS DEBUGFS "events"
128
129 #define CPUFS "/sys/devices/system/cpu/"
130 #define CPUS_ONLINE CPUFS "online"
131 #define CPUS_POSSIBLE CPUFS "possible"
132
133 static void unregister_kprobes(const size_t nprobes);
134
135 struct kprobe_data
136 {
137 string args;
138 char type;
139 int prog_fd;
140 int event_id;
141 int event_fd; // ??? Need one per cpu.
142
143 kprobe_data(char t, string s, int fd)
144 : args(s), type(t), prog_fd(fd), event_id(-1), event_fd(-1)
145 { }
146 };
147
148 struct uprobe_data
149 {
150 string path;
151 char type;
152 int pid;
153 unsigned long long offset;
154 int prog_fd;
155 int event_id;
156 int event_fd;
157
158 uprobe_data(string path, char t, int pid, unsigned long long off, int fd)
159 : path(path), type(t), pid(pid), offset(off), prog_fd(fd),
160 event_id(-1), event_fd(-1)
161 { }
162 };
163
164 struct timer_data
165 {
166 unsigned long period;
167 int prog_fd;
168 int event_fd;
169
170 timer_data(unsigned long period, int fd)
171 : period(period), prog_fd(fd), event_fd(-1)
172 { }
173 };
174
175 struct perf_data
176 {
177 int event_type;
178 int event_config;
179 bool has_freq;
180 unsigned long interval;
181 int prog_fd;
182 int event_fd;
183
184 perf_data(int type, int config, bool freq, unsigned long interval, int fd)
185 : event_type(type), event_config(config), has_freq(freq),
186 interval(interval), prog_fd(fd), event_fd(-1)
187 { }
188 };
189
190 struct trace_data
191 {
192 string system;
193 string name;
194 int prog_fd;
195 int event_id;
196 int event_fd;
197
198 trace_data(char *s, char *n, int fd)
199 : system(s), name(n), prog_fd(fd), event_id(-1), event_fd(-1)
200 { }
201 };
202
203 static std::vector<kprobe_data> kprobes;
204 static std::vector<timer_data> timers;
205 static std::vector<perf_data> perf_probes;
206 static std::vector<trace_data> tracepoint_probes;
207 static std::vector<uprobe_data> uprobes;
208
209 // TODO: Move fatal() to bpfinterp.h and replace abort() calls in the interpreter.
210 // TODO: Add warn() option.
211 static void __attribute__((noreturn))
212 fatal(const char *str, ...)
213 {
214 if (module_name)
215 fprintf(stderr, "Error loading %s: ", module_name);
216
217 va_list va;
218 va_start(va, str);
219 vfprintf(stderr, str, va);
220 va_end(va);
221
222 exit(1);
223 }
224
225 static void
226 fatal_sys()
227 {
228 fatal("%s\n", strerror(errno));
229 }
230
231 static void
232 fatal_elf()
233 {
234 fatal("%s\n", elf_errmsg(-1));
235 }
236
237
238 // XXX: based on get_online_cpus()/read_cpu_range()
239 // in bcc src/cc/common.cc
240 //
241 // PR24543: Also sets default_cpu.
242 //
243 // This is the only way I know of so far, so I have to imitate it for
244 // now. Parsing a /sys/devices diagnostic file seems a bit brittle to
245 // me, though.
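// The file lists comma-separated CPU ranges, e.g. "0-3,5,7-11".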
246 static void
247 mark_active_cpus(unsigned ncpus)
248 {
249 std::ifstream cpu_ranges(CPUS_ONLINE);
250 std::string cpu_range;
251
252 // XXX if cpu0 is offline
253 int alternate_cpu = -1;
254 bool found_alternate = false;
255
256 cpu_online.clear();
257 for (unsigned i = 0; i < ncpus; i++)
258 cpu_online.push_back(false);
259
260 while (std::getline(cpu_ranges, cpu_range, ','))
261 {
262 size_t rangepos = cpu_range.find("-");
263 int start, end;
264 if (rangepos == std::string::npos)
265 {
266 start = end = std::stoi(cpu_range);
267 }
268 else
269 {
270 start = std::stoi(cpu_range.substr(0, rangepos));
271 end = std::stoi(cpu_range.substr(rangepos+1));
272 }
273 for (int i = start; i <= end; i++)
274 {
275 if (!found_alternate)
276 {
277 alternate_cpu = i;
278 found_alternate = true;
279 }
280 cpu_online[i] = true;
281 }
282 }
283
284 // PR24543: Make sure default_cpu is active.
285 if (!cpu_online[default_cpu] && found_alternate)
286 default_cpu = alternate_cpu;
287 }
288
289 static int
290 count_active_cpus()
291 {
292 int count = 0;
293 for (unsigned cpu = 0; cpu < cpu_online.size(); cpu++)
294 if (cpu_online[cpu])
295 count++;
296 return count;
297 }
298
299 static int
300 create_group_fds()
301 {
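// Open a disabled PERF_COUNT_SW_DUMMY event on the default CPU to act as
// the group leader; the kprobe, timer and tracepoint events opened later
// pass this fd as group_fd, so main() can enable/disable them together
// with a single ioctl on group_fd.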
302 perf_event_attr peattr;
303
304 memset(&peattr, 0, sizeof(peattr));
305 peattr.size = sizeof(peattr);
306 peattr.disabled = 1;
307 peattr.type = PERF_TYPE_SOFTWARE;
308 peattr.config = PERF_COUNT_SW_DUMMY;
309
310 return group_fd = perf_event_open(&peattr, -1, default_cpu, -1, 0);
311 }
312
313 static void
314 instantiate_maps (Elf64_Shdr *shdr, Elf_Data *data)
315 {
316 if (shdr->sh_entsize != sizeof(bpf_map_def))
317 fatal("map entry size mismatch (%zu != %zu)\n",
318 (size_t)shdr->sh_entsize, sizeof(bpf_map_def));
319
320 size_t i, n = shdr->sh_size / sizeof(bpf_map_def);
321 struct bpf_map_def *attrs = static_cast<bpf_map_def *>(data->d_buf);
322
323 map_attrs = attrs;
324 map_fds.assign(n, -1);
325
326 // XXX: PR24324 -- This overhead space calculation was too
327 // conservative and caused resource exhaustion errors, disabling it
328 // until we figure out how much space we need or if the
329 // RLIM_INFINITY solution below is adequate.
330 #if 0
331 /* First, make room for the maps in this process' RLIMIT_MEMLOCK: */
332 size_t rlimit_increase = 0;
333 for (i = 0; i < n; ++i)
334 {
335 // TODO: The 58 bytes of overhead space per entry has been
336 // decided by trial and error, and may require further tweaking:
337 rlimit_increase += (58 + attrs[i].key_size + attrs[i].value_size) * attrs[i].max_entries;
338 // TODO: Note that Certain Other Tools just give up on
339 // calculating and set rlimit to the maximum possible.
340 }
341 #endif
342
343 struct rlimit curr_rlimit;
344 int rc;
345
346 rc = getrlimit(RLIMIT_MEMLOCK, &curr_rlimit);
347 if (rc < 0)
348 fatal("could not get map resource limit: %s\n",
349 strerror(errno));
350
351 rlim_t rlim_orig = curr_rlimit.rlim_cur;
352 rlim_t rlim_max_orig = curr_rlimit.rlim_max;
353 #if 0
354 curr_rlimit.rlim_cur += rlimit_increase;
355 curr_rlimit.rlim_max += rlimit_increase;
356 if (curr_rlimit.rlim_cur < rlim_orig) // handle overflow
357 curr_rlimit.rlim_cur = rlim_orig;
358 if (curr_rlimit.rlim_max < rlim_max_orig) // handle overflow
359 curr_rlimit.rlim_max = rlim_max_orig;
360 #endif
361 // TODOXXX: PR24324 -- EXPERIMENTAL fix for aggressive resource limits.
362 // Other Tools do something like this but it doesn't solve all our problems.
363 curr_rlimit.rlim_cur = RLIM_INFINITY;
364 curr_rlimit.rlim_max = RLIM_INFINITY;
365
366 rc = setrlimit(RLIMIT_MEMLOCK, &curr_rlimit);
367 if (rc < 0)
368 fatal("could not increase map resource limit -- "
369 "cur from %lu to %lu, max from %lu to %lu: %s\n",
370 rlim_orig, curr_rlimit.rlim_cur,
371 rlim_max_orig, curr_rlimit.rlim_max,
372 strerror(errno));
373 if (log_level > 1)
374 {
375 fprintf(stderr, "increasing map cur resource limit from %lu to %lu\n",
376 rlim_orig, curr_rlimit.rlim_cur);
377 fprintf(stderr, "increasing map max resource limit from %lu to %lu\n",
378 rlim_max_orig, curr_rlimit.rlim_max);
379 }
380
381 /* Now create the maps: */
382 for (i = 0; i < n; ++i)
383 {
384 /* PR22330: The perf_event_map used for message transport must
385 have max_entries equal to the number of active CPUs, which we
386 wouldn't know for sure at translate time. Set it now: */
387 bpf_map_type map_type = static_cast<bpf_map_type>(attrs[i].type);
388 if (map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY)
389 {
390 /* XXX: Assume our only perf_event_map is the percpu transport one: */
391 assert(i == bpf::globals::perf_event_map_idx);
392 assert(attrs[i].max_entries == bpf::globals::NUM_CPUS_PLACEHOLDER);
393
394 // TODO: perf_event buffers can only be created for currently
395 // active CPUs. For now we imitate Certain Other Tools and
396 // create perf_events for CPUs that are active at startup time
397 // (while sizing the perf_event_map according to total CPUs).
398 // But for full coverage, we really need to listen to CPUs
399 // coming on/offline and adjust accordingly.
400 long ncpus_ = sysconf(_SC_NPROCESSORS_CONF);
401 unsigned ncpus = ncpus_ > 0 ? ncpus_ : 1;
402 if (ncpus_ < 0)
403 fprintf(stderr, "WARNING: could not get number of CPUs, falling back to 1: %s\n", strerror(errno));
404 else if (ncpus_ == 0)
405 fprintf(stderr, "WARNING: could not get number of CPUs, falling back to 1\n"); // XXX no errno
406 //unsigned ncpus = get_nprocs_conf();
407 mark_active_cpus((unsigned)ncpus);
408 attrs[i].max_entries = ncpus;
409 }
410
411 if (log_level > 2)
412 fprintf(stderr, "creating map type %u entry %zu: key_size %u, value_size %u, "
413 "max_entries %u, map_flags %u\n", map_type, i,
414 attrs[i].key_size, attrs[i].value_size,
415 attrs[i].max_entries, attrs[i].map_flags);
416 int fd = bpf_create_map(static_cast<bpf_map_type>(attrs[i].type),
417 attrs[i].key_size, attrs[i].value_size,
418 attrs[i].max_entries, attrs[i].map_flags);
419 if (fd < 0)
420 fatal("map entry %zu: %s\n", i, strerror(errno));
421 map_fds[i] = fd;
422 }
423 }
424
425 static int
426 prog_load(Elf_Data *data, const char *name)
427 {
428 enum bpf_prog_type prog_type;
429
430 if (strncmp(name, "kprobe", 6) == 0)
431 prog_type = BPF_PROG_TYPE_KPROBE;
432 else if (strncmp(name, "kretprobe", 9) == 0)
433 prog_type = BPF_PROG_TYPE_KPROBE;
434 else if (strncmp(name, "uprobe", 6) == 0)
435 prog_type = BPF_PROG_TYPE_KPROBE;
436 else if (strncmp(name, "timer", 5) == 0)
437 prog_type = BPF_PROG_TYPE_PERF_EVENT;
438 else if (strncmp(name, "trace", 5) == 0)
439 prog_type = BPF_PROG_TYPE_TRACEPOINT;
440 else if (strncmp(name, "perf", 4) == 0)
441 {
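// An event type of 2 in "perf/<type>/..." is PERF_TYPE_TRACEPOINT, which
// needs a tracepoint program rather than a perf_event program.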
442 if (name[5] == '2' && name[6] == '/')
443 prog_type = BPF_PROG_TYPE_TRACEPOINT;
444 else
445 prog_type = BPF_PROG_TYPE_PERF_EVENT;
446 }
447 else
448 fatal("unhandled program type for section \"%s\"\n", name);
449
450 if (data->d_size % sizeof(bpf_insn))
451 fatal("program size not a multiple of %zu\n", sizeof(bpf_insn));
452
453 if (kmsg != NULL)
454 {
455 fprintf (kmsg, "%s (%s): stapbpf: %s, name: %s, d_size: %lu\n",
456 module_basename, script_name, VERSION, name, (unsigned long)data->d_size);
457 fflush (kmsg); // Otherwise, flush will only happen after the prog runs.
458 }
459 int fd = bpf_prog_load(prog_type, static_cast<bpf_insn *>(data->d_buf),
460 data->d_size, module_license, kernel_version);
461 if (fd < 0)
462 {
463 if (bpf_log_buf[0] != 0)
464 fatal("bpf program load failed: %s\n%s\n",
465 strerror(errno), bpf_log_buf);
466 else
467 fatal("bpf program load failed: %s\n", strerror(errno));
468 }
469 return fd;
470 }
471
472 static void
473 prog_relocate(Elf_Data *prog_data, Elf_Data *rel_data,
474 Elf_Data *sym_data, Elf_Data *str_data,
475 const char *prog_name, unsigned maps_idx, bool allocated)
476 {
477 bpf_insn *insns = static_cast<bpf_insn *>(prog_data->d_buf);
478 Elf64_Rel *rels = static_cast<Elf64_Rel *>(rel_data->d_buf);
479 Elf64_Sym *syms = static_cast<Elf64_Sym *>(sym_data->d_buf);
480
481 if (prog_data->d_size % sizeof(bpf_insn))
482 fatal("program size not a multiple of %zu\n", sizeof(bpf_insn));
483 if (rel_data->d_type != ELF_T_REL
484 || rel_data->d_size % sizeof(Elf64_Rel))
485 fatal("invalid reloc metadata\n");
486 if (sym_data->d_type != ELF_T_SYM
487 || sym_data->d_size % sizeof(Elf64_Sym))
488 fatal("invalid symbol metadata\n");
489
490 size_t psize = prog_data->d_size;
491 size_t nrels = rel_data->d_size / sizeof(Elf64_Rel);
492 size_t nsyms = sym_data->d_size / sizeof(Elf64_Sym);
493
494 for (size_t i = 0; i < nrels; ++i)
495 {
496 uint32_t sym = ELF64_R_SYM(rels[i].r_info);
497 uint32_t type = ELF64_R_TYPE(rels[i].r_info);
498 unsigned long long r_ofs = rels[i].r_offset;
499 size_t fd_idx;
500
501 if (type != R_BPF_MAP_FD)
502 fatal("invalid relocation type %u\n", type);
503 if (sym >= nsyms)
504 fatal("invalid symbol index %u\n", sym);
505 if (r_ofs >= psize || r_ofs % sizeof(bpf_insn))
506 fatal("invalid relocation offset at %s+%llu\n", prog_name, r_ofs);
507
510 if (syms[sym].st_shndx != maps_idx
511 || syms[sym].st_value % sizeof(bpf_map_def)
512 || (fd_idx = syms[sym].st_value / sizeof(bpf_map_def),
513 fd_idx >= map_fds.size()))
514 {
515 const char *name = "";
516 if (syms[sym].st_name < str_data->d_size)
517 name = static_cast<char *>(str_data->d_buf) + syms[sym].st_name;
518 if (*name)
519 fatal("symbol %s does not reference a map\n", name);
520 else
521 fatal("symbol %u does not reference a map\n", sym);
522 }
523
524 bpf_insn *insn = insns + (r_ofs / sizeof(bpf_insn));
525 if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
526 fatal("invalid relocation insn at %s+%llu\n", prog_name, r_ofs);
527
528 insn->src_reg = BPF_PSEUDO_MAP_FD;
529 insn->imm = (allocated ? map_fds[fd_idx] : fd_idx);
530 }
531 }
532
533 static void
534 maybe_collect_kprobe(const char *name, unsigned name_idx,
535 unsigned fd_idx, Elf64_Addr offset)
536 {
537 char type;
538 string arg;
539
540 if (strncmp(name, "kprobe/", 7) == 0)
541 {
542 string line;
543 const char *stext = NULL;
544 type = 'p';
545 name += 7;
546
547 ifstream syms("/proc/kallsyms");
548 if (!syms)
549 fatal("error opening /proc/kallsyms: %s\n", strerror(errno));
550
551 // get value of symbol _stext and add it to the offset found in name.
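// NB: each kallsyms line looks like "ffffffff81000000 T _stext"; the
// l + 19 below assumes a 16-digit address, a space, a one-character type
// and another space before the symbol name (i.e. a 64-bit kernel).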
552 while (getline(syms, line))
553 {
554 const char *l = line.c_str();
555 if (strncmp(l + 19, "_stext", 6) == 0)
556 {
557 stext = l;
558 break;
559 }
560 }
561
562 if (stext == NULL)
563 fatal("could not find _stext in /proc/kallsyms");
564
565 unsigned long addr = strtoul(stext, NULL, 16);
566 addr += strtoul(name, NULL, 16);
567 stringstream ss;
568 ss << "0x" << hex << addr;
569 arg = ss.str();
570 }
571 else if (strncmp(name, "kretprobe/", 10) == 0)
572 type = 'r', arg = name + 10;
573 else
574 return;
575
576 int fd = -1;
577 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
578 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
579 if (offset != 0)
580 fatal("probe %u offset non-zero\n", name_idx);
581
582 kprobes.push_back(kprobe_data(type, arg, fd));
583 }
584
585 static void
586 collect_uprobe(const char *name, unsigned name_idx, unsigned fd_idx)
587 {
588 char type = '\0';
589 int pid = -1;
590 unsigned long long off = 0;
591 char path[PATH_MAX];
592
593 int res = sscanf(name, "uprobe/%c/%d/%llu%s", &type, &pid, &off, path);
594
595 if (!pid)
596 pid = -1; // indicates to perf_event_open that we're tracing all processes
597
598 if (res != 4)
599 fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);
600
601 int fd = -1;
602 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
603 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
604
605 uprobes.push_back(uprobe_data(std::string(path), type, pid, off, fd));
606 }
607
608 static void
609 collect_perf(const char *name, unsigned name_idx, unsigned fd_idx)
610 {
611 char has_freq;
612 int event_type;
613 int event_config;
614 unsigned long interval;
615
616 int res = sscanf(name, "perf/%d/%d/%c/%lu",
617 &event_type, &event_config, &has_freq, &interval);
618 if (res != 4)
619 fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);
620
621 int fd = -1;
622 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
623 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
624
625 perf_probes.push_back(
626 perf_data(event_type, event_config, has_freq == 'f', interval, fd));
627 }
628
629 static void
630 collect_timer(const char *name, unsigned name_idx, unsigned fd_idx)
631 {
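// The period starts at name + 11, i.e. after an 11-character prefix such
// as "timer/jiff/"; jiffy periods are converted to nanoseconds below.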
632 unsigned long period = strtoul(name + 11, NULL, 10);
633
634 if (strncmp(name + 6, "jiff/", 5) == 0)
635 {
636 long jiffies_per_sec = sysconf(_SC_CLK_TCK);
637 period *= 1e9 / jiffies_per_sec;
638 }
639
640 int fd = -1;
641 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
642 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
643
644 timers.push_back(timer_data(period, fd));
645 return;
646 }
647
648 static void
649 collect_tracepoint(const char *name, unsigned name_idx, unsigned fd_idx)
650 {
651 char tp_system[512];
652 char tp_name[512];
653
654 int res = sscanf(name, "trace/%[^/]/%s", tp_system, tp_name);
655 if (res != 2 || strlen(name) > 512)
656 fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);
657
658 int fd = -1;
659 if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
660 fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
661
662 tracepoint_probes.push_back(trace_data(tp_system, tp_name, fd));
663 }
664
665 static void
666 kprobe_collect_from_syms(Elf_Data *sym_data, Elf_Data *str_data)
667 {
668 Elf64_Sym *syms = static_cast<Elf64_Sym *>(sym_data->d_buf);
669 size_t nsyms = sym_data->d_size / sizeof(Elf64_Sym);
670
671 if (sym_data->d_type != ELF_T_SYM
672 || sym_data->d_size % sizeof(Elf64_Sym))
673 fatal("invalid kprobes symbol metadata\n");
674
675 for (size_t i = 0; i < nsyms; ++i)
676 {
677 const char *name;
678 if (syms[i].st_name < str_data->d_size)
679 name = static_cast<char *>(str_data->d_buf) + syms[i].st_name;
680 else
681 fatal("symbol %u has invalid string index\n", i);
682 maybe_collect_kprobe(name, i, syms[i].st_shndx, syms[i].st_value);
683 }
684 }
685
686 static void
687 unregister_uprobes(const size_t nprobes)
688 {
689 if (nprobes == 0)
690 return;
691
692 int fd = open(UPROBE_EVENTS, O_WRONLY);
693 if (fd < 0)
694 return;
695
696
697 const int pid = getpid();
698 for (size_t i = 0; i < nprobes; ++i)
699 {
700 close(uprobes[i].event_fd);
701
702 char msgbuf[128];
703 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "-:stapprobe_%d_%zu",
704 pid, i);
705 ssize_t wlen = write(fd, msgbuf, olen);
706 if (wlen < 0)
707 fprintf(stderr, "Error removing probe %zu: %s\n",
708 i, strerror(errno));
709 }
710 close(fd);
711 }
712
713 static void
714 register_uprobes()
715 {
716 size_t nprobes = uprobes.size();
717 if (nprobes == 0)
718 return;
719
720 int fd = open(UPROBE_EVENTS, O_WRONLY);
721 if (fd < 0)
722 fatal("Error opening %s: %s\n", UPROBE_EVENTS, strerror(errno));
723
724 const int pid = getpid();
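// Each probe is written to uprobe_events as
// "p:stapprobe_<pid>_<idx> <path>:0x<offset>" (or "r:" for return probes);
// the kernel then exposes it under events/uprobes/stapprobe_<pid>_<idx>/.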
725
726 for (size_t i = 0; i < nprobes; ++i)
727 {
728 uprobe_data &u = uprobes[i];
729 char msgbuf[PATH_MAX];
730
731 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "%c:stapprobe_%d_%zu %s:0x%llx",
732 u.type, pid, i, u.path.c_str(), u.offset);
733 if ((size_t)olen >= sizeof(msgbuf))
734 {
735 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
736 if (i == 0)
737 goto fail_0;
738 nprobes = i; // probes [0, i) were created and must be removed
739 goto fail_n;
740 }
741
742 if (log_level > 1)
743 fprintf(stderr, "Associating probe %zu with uprobe %s\n", i, msgbuf);
744
745 ssize_t wlen = write(fd, msgbuf, olen);
746 if (wlen != olen)
747 {
748 fprintf(stderr, "Error creating probe %zu: %s\n",
749 i, strerror(errno));
750 if (i == 0)
751 goto fail_0;
752 nprobes = i; // probes [0, i) were created and must be removed
753 goto fail_n;
754 }
755 }
756 close(fd);
757
758 for (size_t i = 0; i < nprobes; ++i)
759 {
760 char fnbuf[PATH_MAX];
761 ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
762 DEBUGFS "events/uprobes/stapprobe_%d_%zu/id", pid, i);
763 if ((size_t)len >= sizeof(fnbuf))
764 {
765 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
766 goto fail_n;
767 }
768
769 fd = open(fnbuf, O_RDONLY);
770 if (fd < 0)
771 {
772 fprintf(stderr, "Error opening probe event id %zu: %s\n",
773 i, strerror(errno));
774 goto fail_n;
775 }
776
777 char msgbuf[128];
778 len = read(fd, msgbuf, sizeof(msgbuf) - 1);
779 if (len < 0)
780 {
781 fprintf(stderr, "Error reading probe event id %zu: %s\n",
782 i, strerror(errno));
783 goto fail_n;
784 }
785 close(fd);
786
787 msgbuf[len] = 0;
788 uprobes[i].event_id = atoi(msgbuf);
789 }
790
791 // ??? Iterate to enable on all cpus, each with a different group_fd.
792 {
793 perf_event_attr peattr;
794
795 memset(&peattr, 0, sizeof(peattr));
796 peattr.size = sizeof(peattr);
797 peattr.type = PERF_TYPE_TRACEPOINT;
798 peattr.sample_type = PERF_SAMPLE_RAW;
799 peattr.sample_period = 1;
800 peattr.wakeup_events = 1;
801
802 for (size_t i = 0; i < nprobes; ++i)
803 {
804 uprobe_data &u = uprobes[i];
805 peattr.config = u.event_id;
806
807 fd = perf_event_open(&peattr, u.pid, default_cpu, -1, 0);
808 if (fd < 0)
809 {
810 fprintf(stderr, "Error opening probe id %zu: %s\n",
811 i, strerror(errno));
812 goto fail_n;
813 }
814 u.event_fd = fd;
815
816 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, u.prog_fd) < 0)
817 {
818 fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
819 i, strerror(errno));
820 goto fail_n;
821 }
822 }
823 }
824 return;
825
826 fail_n:
827 unregister_uprobes(nprobes);
828 fail_0:
829 exit(1);
830 }
831
832 static void
833 register_kprobes()
834 {
835 size_t nprobes = kprobes.size();
836 if (nprobes == 0)
837 return;
838
839 int fd = open(KPROBE_EVENTS, O_WRONLY);
840 if (fd < 0)
841 fatal("Error opening %s: %s\n", KPROBE_EVENTS, strerror(errno));
842
843 const int pid = getpid();
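// Each probe is written to kprobe_events as "p:p<pid>_<idx> <address>"
// (or "r:" for kretprobes); the kernel then exposes it under
// events/kprobes/p<pid>_<idx>/.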
844
845 for (size_t i = 0; i < nprobes; ++i)
846 {
847 kprobe_data &k = kprobes[i];
848 char msgbuf[128];
849
850 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "%c:p%d_%zu %s",
851 k.type, pid, i, k.args.c_str());
852 if ((size_t)olen >= sizeof(msgbuf))
853 {
854 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
855 if (i == 0)
856 goto fail_0;
857 nprobes = i; // probes [0, i) were created and must be removed
858 goto fail_n;
859 }
860
861 if (log_level > 1)
862 fprintf(stderr, "Associating probe %zu with kprobe %s\n", i, msgbuf);
863
864 ssize_t wlen = write(fd, msgbuf, olen);
865 if (wlen != olen)
866 {
867 fprintf(stderr, "Error creating probe %zu: %s\n",
868 i, strerror(errno));
869 if (i == 0)
870 goto fail_0;
871 nprobes = i; // probes [0, i) were created and must be removed
872 goto fail_n;
873 }
874 }
875 close(fd);
876
877 for (size_t i = 0; i < nprobes; ++i)
878 {
879 char fnbuf[PATH_MAX];
880 ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
881 DEBUGFS "events/kprobes/p%d_%zu/id", pid, i);
882 if ((size_t)len >= sizeof(fnbuf))
883 {
884 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
885 goto fail_n;
886 }
887
888 fd = open(fnbuf, O_RDONLY);
889 if (fd < 0)
890 {
891 fprintf(stderr, "Error opening probe event id %zu: %s\n",
892 i, strerror(errno));
893 goto fail_n;
894 }
895
896 char msgbuf[128];
897 len = read(fd, msgbuf, sizeof(msgbuf) - 1);
898 if (len < 0)
899 {
900 fprintf(stderr, "Error reading probe event id %zu: %s\n",
901 i, strerror(errno));
902 goto fail_n;
903 }
904 close(fd);
905
906 msgbuf[len] = 0;
907 kprobes[i].event_id = atoi(msgbuf);
908 }
909
910 // ??? Iterate to enable on all cpus, each with a different group_fd.
911 {
912 perf_event_attr peattr;
913
914 memset(&peattr, 0, sizeof(peattr));
915 peattr.size = sizeof(peattr);
916 peattr.type = PERF_TYPE_TRACEPOINT;
917 peattr.sample_type = PERF_SAMPLE_RAW;
918 peattr.sample_period = 1;
919 peattr.wakeup_events = 1;
920
921 for (size_t i = 0; i < nprobes; ++i)
922 {
923 kprobe_data &k = kprobes[i];
924 peattr.config = k.event_id;
925
926 fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
927 if (fd < 0)
928 {
929 fprintf(stderr, "Error opening probe id %zu: %s\n",
930 i, strerror(errno));
931 goto fail_n;
932 }
933 k.event_fd = fd;
934
935 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, k.prog_fd) < 0)
936 {
937 fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
938 i, strerror(errno));
939 goto fail_n;
940 }
941 }
942 }
943 return;
944
945 fail_n:
946 unregister_kprobes(nprobes);
947 fail_0:
948 exit(1);
949 }
950
951 static void
952 unregister_kprobes(const size_t nprobes)
953 {
954 if (nprobes == 0)
955 return;
956
957 int fd = open(KPROBE_EVENTS, O_WRONLY);
958 if (fd < 0)
959 return;
960
961
962 const int pid = getpid();
963 for (size_t i = 0; i < nprobes; ++i)
964 {
965 close(kprobes[i].event_fd);
966
967 char msgbuf[128];
968 ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "-:p%d_%zu",
969 pid, i);
970 ssize_t wlen = write(fd, msgbuf, olen);
971 if (wlen < 0)
972 fprintf(stderr, "Error removing probe %zu: %s\n",
973 i, strerror(errno));
974 }
975 close(fd);
976 }
977
978 static void
979 unregister_tracepoints(const size_t nprobes)
980 {
981 for (size_t i = 0; i < nprobes; ++i)
982 close(tracepoint_probes[i].event_fd);
983 }
984
985 static void
986 register_tracepoints()
987 {
988 size_t nprobes = tracepoint_probes.size();
989 if (nprobes == 0)
990 return;
991
992 for (size_t i = 0; i < nprobes; ++i)
993 {
994 trace_data &t = tracepoint_probes[i];
995 char fnbuf[PATH_MAX];
996 ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
997 DEBUGFS "events/%s/%s/id",
998 t.system.c_str(), t.name.c_str());
999 if ((size_t)len >= sizeof(fnbuf))
1000 {
1001 fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
1002 goto fail;
1003 }
1004
1005 int fd = open(fnbuf, O_RDONLY);
1006 if (fd < 0)
1007 {
1008 fprintf(stderr, "Error opening probe event id %zu: %s\n",
1009 i, strerror(errno));
1010
1011 if (errno == ENOENT)
1012 fprintf(stderr, "\"%s/%s\" could not be found in %s\n",
1013 t.system.c_str(), t.name.c_str(), EVENTS);
1014
1015 goto fail;
1016 }
1017
1018 char msgbuf[128];
1019 len = read(fd, msgbuf, sizeof(msgbuf) - 1);
1020 if (len < 0)
1021 {
1022 fprintf(stderr, "Error reading probe event id %zu: %s\n",
1023 i, strerror(errno));
1024 close(fd);
1025 goto fail;
1026 }
1027 close(fd);
1028
1029 msgbuf[len] = 0;
1030 t.event_id = atoi(msgbuf);
1031 }
1032
1033 // ??? Iterate to enable on all cpus, each with a different group_fd.
1034 {
1035 perf_event_attr peattr;
1036
1037 memset(&peattr, 0, sizeof(peattr));
1038 peattr.size = sizeof(peattr);
1039 peattr.type = PERF_TYPE_TRACEPOINT;
1040 peattr.sample_type = PERF_SAMPLE_RAW;
1041 peattr.sample_period = 1;
1042 peattr.wakeup_events = 1;
1043
1044 for (size_t i = 0; i < nprobes; ++i)
1045 {
1046 trace_data &t = tracepoint_probes[i];
1047 peattr.config = t.event_id;
1048
1049 int fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
1050 if (fd < 0)
1051 {
1052 fprintf(stderr, "Error opening probe id %zu: %s\n",
1053 i, strerror(errno));
1054 goto fail;
1055 }
1056 t.event_fd = fd;
1057
1058 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, t.prog_fd) < 0)
1059 {
1060 fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
1061 i, strerror(errno));
1062 goto fail;
1063 }
1064 }
1065 }
1066 return;
1067
1068 fail:
1069 unregister_tracepoints(nprobes);
1070 exit(1);
1071 }
1072
1073 static void
1074 unregister_timers(const size_t nprobes)
1075 {
1076 for (size_t i = 0; i < nprobes; ++i)
1077 close(timers[i].event_fd);
1078 }
1079
1080 static void
1081 register_timers()
1082 {
1083 perf_event_attr peattr;
1084
1085 memset(&peattr, 0, sizeof(peattr));
1086 peattr.size = sizeof(peattr);
1087 peattr.type = PERF_TYPE_SOFTWARE;
1088 peattr.config = PERF_COUNT_SW_CPU_CLOCK;
1089
1090 for (size_t i = 0; i < timers.size(); ++i)
1091 {
1092 timer_data &t = timers[i];
1093 peattr.sample_period = t.period;
1094
1095 int fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
1096 if (fd < 0)
1097 {
1098 int err = errno;
1099 unregister_timers(timers.size());
1100 fatal("Error opening timer probe id %zu: %s\n", i + 1, strerror(err));
1101 }
1102
1103 t.event_fd = fd;
1104 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, t.prog_fd) < 0)
1105 {
1106 int err = errno;
1107 unregister_timers(timers.size());
1108 fatal("Error installing bpf for timer probe id %zu: %s\n",
1109 i + 1, strerror(err));
1110 }
1111 }
1112
1113 return;
1114 }
1115
1116 static void
1117 unregister_perf(const size_t nprobes)
1118 {
1119 for (size_t i = 0; i < nprobes; ++i)
1120 close(perf_probes[i].event_fd);
1121 }
1122
1123 static void
1124 register_perf()
1125 {
1126 for (size_t i = 0; i < perf_probes.size(); ++i)
1127 {
1128 perf_data &p = perf_probes[i];
1129 perf_event_attr peattr;
1130
1131 memset(&peattr, 0, sizeof(peattr));
1132 peattr.size = sizeof(peattr);
1133 peattr.type = p.event_type;
1134 peattr.config = p.event_config;
1135
1136 if (p.has_freq)
1137 {
1138 peattr.freq = 1;
1139 peattr.sample_freq = p.interval;
1140 }
1141 else
1142 peattr.sample_period = p.interval;
1143
1144 // group_fd is not used since this event might have an
1145 // incompatible type/config.
1146 int fd = perf_event_open(&peattr, -1, default_cpu, -1, 0);
1147 if (fd < 0)
1148 {
1149 int err = errno;
1150 unregister_perf(perf_probes.size());
1151 fatal("Error opening perf probe id %zu: %s\n", i + 1, strerror(err));
1152 }
1153
1154 p.event_fd = fd;
1155 if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, p.prog_fd) < 0)
1156 {
1157 int err = errno;
1158 unregister_perf(perf_probes.size());
1159 fatal("Error installing bpf for perf probe id %zu: %s\n",
1160 i + 1, strerror(err));
1161 }
1162 }
1163 }
1164
1165 static void
1166 init_internal_globals()
1167 {
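// Clear the shared EXIT flag so the probes start in the running state.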
1168 using namespace bpf;
1169
1170 int key = globals::EXIT;
1171 long val = 0;
1172
1173 if (bpf_update_elem(map_fds[globals::internal_map_idx],
1174 (void*)&key, (void*)&val, BPF_ANY) != 0)
1175 fatal("Error updating pid: %s\n", strerror(errno));
1176
1177 }
1178
1179 // PR22330: Initialize perf_event_map and perf_fds.
1180 static void
1181 init_perf_transport()
1182 {
1183 using namespace bpf;
1184
1185 unsigned ncpus = map_attrs[globals::perf_event_map_idx].max_entries;
1186
1187 for (unsigned cpu = 0; cpu < ncpus; cpu++)
1188 {
1189 if (!cpu_online[cpu]) // -- skip inactive CPUs.
1190 {
1191 perf_fds.push_back(-1);
1192 transport_contexts.push_back(nullptr);
1193 continue;
1194 }
1195
1196 struct perf_event_attr peattr;
1197
1198 memset(&peattr, 0, sizeof(peattr));
1199 peattr.size = sizeof(peattr);
1200 peattr.sample_type = PERF_SAMPLE_RAW;
1201 peattr.type = PERF_TYPE_SOFTWARE;
1202 peattr.config = PERF_COUNT_SW_BPF_OUTPUT;
1203 peattr.sample_period = 1;
1204 peattr.wakeup_events = 1;
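// PERF_COUNT_SW_BPF_OUTPUT is the software event that receives records
// emitted by the BPF programs via bpf_perf_event_output(); one is opened
// per online CPU and stored in the perf_event_map.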
1205
1206 int pmu_fd = perf_event_open(&peattr, -1/*pid*/, cpu, -1/*group_fd*/, 0);
1207 if (pmu_fd < 0)
1208 fatal("Error initializing perf event for cpu %d: %s\n", cpu, strerror(errno));
1209 if (bpf_update_elem(map_fds[globals::perf_event_map_idx],
1210 (void*)&cpu, (void*)&pmu_fd, BPF_ANY) != 0)
1211 fatal("Error assigning perf event for cpu %d: %s\n", cpu, strerror(errno));
1212 ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
1213 perf_fds.push_back(pmu_fd);
1214
1215 // Create a data structure to track what's happening on each CPU:
1216 bpf_transport_context *ctx
1217 = new bpf_transport_context(cpu, pmu_fd, ncpus, map_attrs, &map_fds,
1218 output_f, &interned_strings, &aggregates);
1219 transport_contexts.push_back(ctx);
1220 }
1221
1222 // XXX: based on perf_event_mmap_header()
1223 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1224 perf_event_page_size = getpagesize();
1225 perf_event_mmap_size = perf_event_page_size * (perf_event_page_count + 1);
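// Each ring buffer is one metadata page followed by perf_event_page_count
// data pages, hence the "+ 1" above.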
1226 for (unsigned cpu = 0; cpu < ncpus; cpu++)
1227 {
1228 if (!cpu_online[cpu]) // -- skip inactive CPUs.
1229 {
1230 perf_headers.push_back(nullptr);
1231 continue;
1232 }
1233
1234 int pmu_fd = perf_fds[cpu];
1235 void *base = mmap(NULL, perf_event_mmap_size,
1236 PROT_READ | PROT_WRITE, MAP_SHARED,
1237 pmu_fd, 0);
1238 if (base == MAP_FAILED)
1239 fatal("error mmapping header for perf_event fd %d\n", pmu_fd);
1240 perf_headers.push_back((perf_event_mmap_page*)base);
1241 if (log_level > 2)
1242 fprintf(stderr, "Initialized perf_event output on cpu %d\n", cpu);
1243 }
1244 }
1245
1246 static void
1247 load_bpf_file(const char *module)
1248 {
1249 module_name = module;
1250
1251 /* Extract basename: */
1252 char *buf = (char *)malloc(BPF_MAXSTRINGLEN * sizeof(char));
1253 string module_name_str(module);
1254 string module_basename_str
1255 = module_name_str.substr(module_name_str.rfind('/')+1); // basename
1256 size_t len = module_basename_str.copy(buf, BPF_MAXSTRINGLEN-1);
1257 buf[len] = '\0';
1258 module_basename = buf;
1259
1260 int fd = open(module, O_RDONLY);
1261 if (fd < 0)
1262 fatal_sys();
1263
1264 elf_version(EV_CURRENT);
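// libelf requires the expected ELF version to be declared before elf_begin().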
1265
1266 Elf *elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL);
1267 if (elf == NULL)
1268 fatal_elf();
1269 module_elf = elf;
1270
1271 Elf64_Ehdr *ehdr = elf64_getehdr(elf);
1272 if (ehdr == NULL)
1273 fatal_elf();
1274
1275 // Byte order should match the host, since we're loading locally.
1276 {
1277 const char *end_str;
1278 switch (ehdr->e_ident[EI_DATA])
1279 {
1280 case ELFDATA2MSB:
1281 if (__BYTE_ORDER == __BIG_ENDIAN)
1282 break;
1283 end_str = "MSB";
1284 goto err_endian;
1285 case ELFDATA2LSB:
1286 if (__BYTE_ORDER == __LITTLE_ENDIAN)
1287 break;
1288 end_str = "LSB";
1289 goto err_endian;
1290 case ELFDATANONE:
1291 end_str = "none";
1292 goto err_endian;
1293 default:
1294 end_str = "unknown";
1295 err_endian:
1296 fatal("incorrect byte ordering: %s\n", end_str);
1297 }
1298 }
1299
1300 // Tiny bit of sanity checking on the rest of the header. Since LLVM
1301 // began by producing files with EM_NONE, accept that too.
1302 if (ehdr->e_machine != EM_NONE && ehdr->e_machine != EM_BPF)
1303 fatal("incorrect machine type: %d\n", ehdr->e_machine);
1304
1305 unsigned shnum = ehdr->e_shnum;
1306 prog_fds.assign(shnum, -1);
1307
1308 std::vector<Elf64_Shdr *> shdrs(shnum, NULL);
1309 std::vector<Elf_Data *> sh_data(shnum, NULL);
1310 std::vector<const char *> sh_name(shnum, NULL);
1311 unsigned maps_idx = 0;
1312 unsigned version_idx = 0;
1313 unsigned license_idx = 0;
1314 unsigned script_name_idx = 0;
1315 unsigned interned_strings_idx = 0;
1316 unsigned aggregates_idx = 0;
1317 unsigned kprobes_idx = 0;
1318 unsigned begin_idx = 0;
1319 unsigned end_idx = 0;
1320
1321 // First pass to identify special sections, and make sure
1322 // all data is readable.
1323 for (unsigned i = 1; i < shnum; ++i)
1324 {
1325 Elf_Scn *scn = elf_getscn(elf, i);
1326 if (!scn)
1327 fatal_elf();
1328
1329 Elf64_Shdr *shdr = elf64_getshdr(scn);
1330 if (!shdr)
1331 fatal_elf();
1332
1333 const char *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
1334 if (!shname)
1335 fatal_elf();
1336
1337 // We need not consider any empty sections.
1338 if (shdr->sh_size == 0 || !*shname)
1339 continue;
1340
1341 Elf_Data *data = elf_getdata(scn, NULL);
1342 if (data == NULL)
1343 fatal_elf();
1344
1345 shdrs[i] = shdr;
1346 sh_name[i] = shname;
1347 sh_data[i] = data;
1348
1349 if (strcmp(shname, "license") == 0)
1350 license_idx = i;
1351 else if (strcmp(shname, "stapbpf_script_name") == 0)
1352 script_name_idx = i;
1353 else if (strcmp(shname, "stapbpf_interned_strings") == 0)
1354 interned_strings_idx = i;
1355 else if (strcmp(shname, "stapbpf_aggregates") == 0)
1356 aggregates_idx = i;
1357 else if (strcmp(shname, "version") == 0)
1358 version_idx = i;
1359 else if (strcmp(shname, "maps") == 0)
1360 maps_idx = i;
1361 else if (strcmp(shname, "kprobes") == 0)
1362 kprobes_idx = i;
1363 else if (strcmp(shname, "stap_begin") == 0)
1364 begin_idx = i;
1365 else if (strcmp(shname, "stap_end") == 0)
1366 end_idx = i;
1367 }
1368
1369 // Two special sections are not optional.
1370 if (license_idx != 0)
1371 module_license = static_cast<char *>(sh_data[license_idx]->d_buf);
1372 else
1373 fatal("missing license section\n");
1374 if (script_name_idx != 0)
1375 script_name = static_cast<char *>(sh_data[script_name_idx]->d_buf);
1376 else
1377 script_name = "<unknown>";
1378 if (version_idx != 0)
1379 {
1380 unsigned long long size = shdrs[version_idx]->sh_size;
1381 if (size != 4)
1382 fatal("invalid version size (%llu)\n", size);
1383 memcpy(&kernel_version, sh_data[version_idx]->d_buf, 4);
1384 }
1385 else
1386 fatal("missing version section\n");
1387
1388 // Create bpf maps as required.
1389 if (maps_idx != 0)
1390 instantiate_maps(shdrs[maps_idx], sh_data[maps_idx]);
1391
1392 // Create interned strings as required.
1393 if (interned_strings_idx != 0)
1394 {
1395 // XXX: Whatever the type used by the translator, this section
1396 // just holds a blob of NUL-terminated strings we parse as follows:
1397 char *strtab = static_cast<char *>(sh_data[interned_strings_idx]->d_buf);
1398 unsigned long long strtab_size = shdrs[interned_strings_idx]->sh_size;
1399 unsigned ofs = 0;
1400 bool found_hdr = false;
1401 while (ofs < strtab_size)
1402 {
1403 // XXX: Potentially vulnerable to NUL byte in string constant.
1404 std::string str(strtab+ofs); // XXX: will slurp up to NUL byte
1405 if (str.size() == 0 && !found_hdr)
1406 found_hdr = true; // section *may* start with an extra NUL byte
1407 else
1408 interned_strings.push_back(str);
1409 ofs += str.size() + 1;
1410 }
1411 }
1412
1413 // PR23476: Initialize table of statistical aggregates.
1414 if (aggregates_idx != 0)
1415 {
1416 uint64_t *aggtab = static_cast<uint64_t *>(sh_data[aggregates_idx]->d_buf);
1417 unsigned long long aggtab_size = shdrs[aggregates_idx]->sh_size;
1418 unsigned ofs = 0; unsigned i = 0;
1419 while (ofs < aggtab_size)
1420 {
1421 bpf::globals::agg_idx agg_id = (bpf::globals::agg_idx)aggtab[i];
1422 bpf::globals::interned_stats_map ism;
1423 for (unsigned j = 0; j < bpf::globals::stat_fields.size(); j++)
1424 {
1425 ism.push_back(aggtab[i+1+j]);
1426 }
1427 aggregates[agg_id] = bpf::globals::deintern_stats_map(ism);
1428 i += 1 + bpf::globals::stat_fields.size();
1429 ofs = sizeof(uint64_t) * i;
1430 }
1431 }
1432
1433 // Relocate all programs that require it.
1434 for (unsigned i = 1; i < shnum; ++i)
1435 {
1436 Elf64_Shdr *rel_hdr = shdrs[i];
1437 if (rel_hdr == NULL || rel_hdr->sh_type != SHT_REL)
1438 continue;
1439
1440 unsigned progi = rel_hdr->sh_info;
1441 if (progi == 0 || progi >= shnum)
1442 fatal("invalid section info %u->%u\n", i, progi);
1443 Elf64_Shdr *prog_hdr = shdrs[progi];
1444
1445 unsigned symi = rel_hdr->sh_link;
1446 if (symi == 0 || symi >= shnum)
1447 fatal("invalid section link %u->%u\n", i, symi);
1448 Elf64_Shdr *sym_hdr = shdrs[symi];
1449
1450 unsigned stri = sym_hdr->sh_link;
1451 if (stri == 0 || stri >= shnum)
1452 fatal("invalid section link %u->%u\n", symi, stri);
1453
1454 if (prog_hdr->sh_flags & SHF_EXECINSTR)
1455 prog_relocate(sh_data[progi], sh_data[i], sh_data[symi],
1456 sh_data[stri], sh_name[progi], maps_idx,
1457 prog_hdr->sh_flags & SHF_ALLOC);
1458 }
1459
1460 // Load all programs that require it.
1461 for (unsigned i = 1; i < shnum; ++i)
1462 {
1463 Elf64_Shdr *shdr = shdrs[i];
1464 if ((shdr->sh_flags & SHF_ALLOC) && (shdr->sh_flags & SHF_EXECINSTR))
1465 prog_fds[i] = prog_load(sh_data[i], sh_name[i]);
1466 }
1467
1468 // Remember begin and end probes.
1469 if (begin_idx)
1470 {
1471 Elf64_Shdr *shdr = shdrs[begin_idx];
1472 if (shdr->sh_flags & SHF_EXECINSTR)
1473 prog_begin = sh_data[begin_idx];
1474 }
1475 if (end_idx)
1476 {
1477 Elf64_Shdr *shdr = shdrs[end_idx];
1478 if (shdr->sh_flags & SHF_EXECINSTR)
1479 prog_end = sh_data[end_idx];
1480 }
1481
1482 // Record all kprobes.
1483 if (kprobes_idx != 0)
1484 {
1485 // The Preferred Systemtap Way puts kprobe strings into a symbol
1486 // table, so that multiple kprobes can reference the same program.
1487
1488 // ??? We don't really have to have a separate kprobe symbol table;
1489 // we could pull kprobes out of the main symbol table too. This
1490 // would probably make it easier for llvm-bpf folks to transition.
1491 // One would only need to create symbol aliases with custom asm names.
1492
1493 Elf64_Shdr *sym_hdr = shdrs[kprobes_idx];
1494 if (sym_hdr->sh_type != SHT_SYMTAB)
1495 fatal("invalid section type for kprobes section\n");
1496
1497 unsigned stri = sym_hdr->sh_link;
1498 if (stri == 0 || stri >= shnum)
1499 fatal("invalid section link %u->%u\n", kprobes_idx, stri);
1500
1501 kprobe_collect_from_syms(sh_data[kprobes_idx], sh_data[stri]);
1502 }
1503 else
1504 {
1505 // The original llvm-bpf way puts kprobe strings into the
1506 // section name. Each kprobe has its own program.
1507 for (unsigned i = 1; i < shnum; ++i)
1508 maybe_collect_kprobe(sh_name[i], i, i, 0);
1509 }
1510
1511 // Record all other probes
1512 for (unsigned i = 1; i < shnum; ++i) {
1513 if (strncmp(sh_name[i], "uprobe", 6) == 0)
1514 collect_uprobe(sh_name[i], i, i);
1515 if (strncmp(sh_name[i], "trace", 5) == 0)
1516 collect_tracepoint(sh_name[i], i, i);
1517 if (strncmp(sh_name[i], "perf", 4) == 0)
1518 collect_perf(sh_name[i], i, i);
1519 if (strncmp(sh_name[i], "timer", 5) == 0)
1520 collect_timer(sh_name[i], i, i);
1521 }
1522 }
1523
1524 static int
1525 get_exit_status()
1526 {
1527 int key = bpf::globals::EXIT;
1528 long val = 0;
1529
1530 if (bpf_lookup_elem
1531 (map_fds[bpf::globals::internal_map_idx], &key, &val) != 0)
1532 fatal("error during bpf map lookup: %s\n", strerror(errno));
1533
1534 return val;
1535 }
1536
1537 // XXX: based on perf_event_sample
1538 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1539 struct perf_event_sample {
1540 struct perf_event_header header;
1541 __u32 size;
1542 char data[];
1543 };
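// For PERF_RECORD_SAMPLE records written by bpf_perf_event_output(), 'size'
// counts itself plus the raw payload in data[] (as the handler below assumes).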
1544
1545 static enum bpf_perf_event_ret
1546 perf_event_handle(struct perf_event_header *hdr, void *private_data)
1547 {
1548 // XXX: based on bpf_perf_event_print
1549 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1550
1551 struct perf_event_sample *e = (struct perf_event_sample *)hdr;
1552 bpf_transport_context *ctx = (bpf_transport_context *)private_data;
1553 bpf_perf_event_ret ret;
1554
1555 // Make sure we weren't passed a userspace context by accident.
1556 assert(ctx->pmu_fd >= 0);
1557
1558 if (e->header.type == PERF_RECORD_SAMPLE)
1559 {
1560 __u32 actual_size = e->size - sizeof(e->size);
1561 ret = bpf_handle_transport_msg(e->data, actual_size, ctx);
1562 if (ret != LIBBPF_PERF_EVENT_CONT)
1563 return ret;
1564 }
1565 else if (e->header.type == PERF_RECORD_LOST)
1566 {
1567 struct lost_events {
1568 struct perf_event_header header;
1569 __u64 id;
1570 __u64 lost;
1571 };
1572 struct lost_events *lost = (lost_events *) e;
1573 fprintf(stderr, "WARNING: lost %lld perf_events on cpu %d\n",
1574 (long long)lost->lost, ctx->cpu);
1575 }
1576 else
1577 {
1578 fprintf(stderr, "WARNING: unknown perf_event type=%d size=%d on cpu %d\n",
1579 e->header.type, e->header.size, ctx->cpu);
1580 }
1581 return LIBBPF_PERF_EVENT_CONT;
1582 }
1583
1584 // PR22330: Listen for perf_events.
1585 static void
1586 perf_event_loop(pthread_t main_thread)
1587 {
1588 // XXX: based on perf_event_poller_multi()
1589 // in kernel tools/testing/selftests/bpf/trace_helpers.c
1590
1591 enum bpf_perf_event_ret ret;
1592 void *data = NULL;
1593 size_t len = 0;
1594
1595 unsigned ncpus
1596 = map_attrs[bpf::globals::perf_event_map_idx].max_entries;
1597 unsigned n_active_cpus
1598 = count_active_cpus();
1599 struct pollfd *pmu_fds
1600 = (struct pollfd *)malloc(n_active_cpus * sizeof(struct pollfd));
1601 vector<unsigned> cpuids;
1602
1603 assert(ncpus == perf_fds.size());
1604 unsigned i = 0;
1605 for (unsigned cpu = 0; cpu < ncpus; cpu++)
1606 {
1607 if (!cpu_online[cpu]) continue; // -- skip inactive CPUs.
1608
1609 pmu_fds[i].fd = perf_fds[cpu];
1610 pmu_fds[i].events = POLLIN;
1611 cpuids.push_back(cpu);
1612 i++;
1613 }
1614 assert(n_active_cpus == cpuids.size());
1615
1616 // Avoid multiple warnings about errors reading from an fd:
1617 std::set<int> already_warned;
1618
1619 for (;;)
1620 {
1621 if (log_level > 3)
1622 fprintf(stderr, "Polling for perf_event data on %d cpus...\n", n_active_cpus);
1623 int ready = poll(pmu_fds, n_active_cpus, 1000); // XXX: Consider setting timeout -1 (unlimited).
1624 if (ready < 0 && errno == EINTR)
1625 goto signal_exit;
1626 if (ready < 0)
1627 fatal("Error checking for perf events: %s\n", strerror(errno));
1628 for (unsigned i = 0; i < n_active_cpus; i++)
1629 {
1630 if (pmu_fds[i].revents <= 0)
1631 continue;
1632 if (log_level > 3)
1633 fprintf(stderr, "Saw perf_event on fd %d\n", pmu_fds[i].fd);
1634
1635 ready --;
1636 unsigned cpu = cpuids[i];
1637 ret = bpf_perf_event_read_simple
1638 (perf_headers[cpu],
1639 perf_event_page_count * perf_event_page_size,
1640 perf_event_page_size,
1641 &data, &len,
1642 perf_event_handle, transport_contexts[cpu]);
1643
1644 if (ret == LIBBPF_PERF_EVENT_DONE)
1645 {
1646 // Saw STP_EXIT message. If the exit flag is set,
1647 // wake up main thread to begin program shutdown.
1648 if (get_exit_status())
1649 goto signal_exit;
1650 continue;
1651 }
1652 if (ret != LIBBPF_PERF_EVENT_CONT)
1653 if (already_warned.count(pmu_fds[i].fd) == 0)
1654 {
1655 fprintf(stderr, "WARNING: could not read from perf_event buffer on fd %d\n", pmu_fds[i].fd);
1656 already_warned.insert(pmu_fds[i].fd);
1657 }
1658 }
1659 assert(ready == 0);
1660 }
1661
1662 signal_exit:
1663 pthread_kill(main_thread, SIGINT);
1664 free(pmu_fds);
1665 return;
1666 }
1667
1668 static void
1669 usage(const char *argv0)
1670 {
1671 printf("Usage: %s [-v][-w][-V][-h] [-o FILE] <bpf-file>\n"
1672 " -h, --help Show this help text\n"
1673 " -v, --verbose Increase verbosity\n"
1674 " -V, --version Show version\n"
1675 " -w Suppress warnings\n"
1676 " -x pid Sets the '_stp_target' variable to pid.\n"
1677 " -o FILE Send output to FILE\n",
1678 argv0);
1679 }
1680
1681
1682 void
1683 sigint(int s)
1684 {
1685 // suppress any subsequent SIGINTs that may come from stap parent process
1686 signal(s, SIG_IGN);
1687
1688 // during the exit phase, ^C should exit immediately
1689 if (exit_phase)
1690 {
1691 if (!interrupt_message) // avoid duplicate message
1692 fprintf(stderr, "received interrupt during exit probe\n");
1693 interrupt_message = 1;
1694 abort();
1695 }
1696
1697 // set exit flag
1698 int key = bpf::globals::EXIT;
1699 long val = 1;
1700
1701 if (bpf_update_elem
1702 (map_fds[bpf::globals::internal_map_idx], &key, &val, 0) != 0)
1703 fatal("error during bpf map update: %s\n", strerror(errno));
1704 }
1705
1706 int
1707 main(int argc, char **argv)
1708 {
1709 static const option long_opts[] = {
1710 { "help", 0, NULL, 'h' },
1711 { "verbose", 0, NULL, 'v' },
1712 { "version", 0, NULL, 'V' },
1713 };
1714
1715 int rc;
1716
1717 while ((rc = getopt_long(argc, argv, "hvVwx:o:", long_opts, NULL)) >= 0)
1718 switch (rc)
1719 {
1720 case 'v':
1721 log_level++;
1722 break;
1723 case 'w':
1724 warnings = 0;
1725 break;
1726
1727 case 'x':
1728 target_pid = atoi(optarg);
1729 break;
1730
1731 case 'o':
1732 output_f = fopen(optarg, "w");
1733 if (output_f == NULL)
1734 {
1735 fprintf(stderr, "Error opening %s for output: %s\n",
1736 optarg, strerror(errno));
1737 return 1;
1738 }
1739 break;
1740
1741 case 'V':
1742 printf("Systemtap BPF loader/runner (version %s, %s)\n"
1743 "Copyright (C) 2016-2019 Red Hat, Inc. and others\n" // PRERELEASE
1744 "This is free software; "
1745 "see the source for copying conditions.\n",
1746 VERSION, STAP_EXTENDED_VERSION);
1747 return 0;
1748
1749 case 'h':
1750 usage(argv[0]);
1751 return 0;
1752
1753 default:
1754 do_usage:
1755 usage(argv[0]);
1756 return 1;
1757 }
1758 if (optind != argc - 1)
1759 goto do_usage;
1760
1761 // Be sure dmesg mentions that we are loading bpf programs:
1762 kmsg = fopen("/dev/kmsg", "w");
1763 if (kmsg == NULL)
1764 fprintf(stderr, "WARNING: could not open /dev/kmsg for diagnostics: %s\n", strerror(errno));
1765
1766 load_bpf_file(argv[optind]); // <- XXX initializes cpus online, PR24543 initializes default_cpu
1767 init_internal_globals();
1768 init_perf_transport();
1769
1770 // Create a bpf_transport_context for userspace programs:
1771 unsigned ncpus = map_attrs[bpf::globals::perf_event_map_idx].max_entries;
1772 bpf_transport_context uctx(default_cpu, -1/*pmu_fd*/, ncpus,
1773 map_attrs, &map_fds, output_f,
1774 &interned_strings, &aggregates);
1775
1776 if (create_group_fds() < 0)
1777 fatal("Error creating perf event group: %s\n", strerror(errno));
1778
1779 register_kprobes();
1780 register_uprobes();
1781 register_timers();
1782 register_tracepoints();
1783 register_perf();
1784
1785 // Run the begin probes.
1786 if (prog_begin)
1787 bpf_interpret(prog_begin->d_size / sizeof(bpf_insn),
1788 static_cast<bpf_insn *>(prog_begin->d_buf),
1789 &uctx);
1790
1791 // Wait for ^C; read BPF_OUTPUT events, copying them to output_f.
1792 signal(SIGINT, (sighandler_t)sigint);
1793 signal(SIGTERM, (sighandler_t)sigint);
1794
1795 // PR22330: Listen for perf_events:
1796 std::thread(perf_event_loop, pthread_self()).detach();
1797
1798 // Now that the begin probe has run and the perf_event listener is active, enable the kprobes.
1799 ioctl(group_fd, PERF_EVENT_IOC_ENABLE, 0);
1800
1801 // Wait for STP_EXIT message:
1802 while (!get_exit_status())
1803 pause();
1804
1805 // Disable the kprobes before deregistering and running exit probes.
1806 ioctl(group_fd, PERF_EVENT_IOC_DISABLE, 0);
1807 close(group_fd);
1808
1809 // Unregister all probes.
1810 unregister_kprobes(kprobes.size());
1811 unregister_uprobes(uprobes.size());
1812 unregister_timers(timers.size());
1813 unregister_perf(perf_probes.size());
1814 unregister_tracepoints(tracepoint_probes.size());
1815
1816 // We are now running exit probes, so ^C should exit immediately:
1817 exit_phase = 1;
1818 signal(SIGINT, (sighandler_t)sigint); // restore previously ignored signal
1819 signal(SIGTERM, (sighandler_t)sigint);
1820
1821 // Run the end+error probes.
1822 if (prog_end)
1823 bpf_interpret(prog_end->d_size / sizeof(bpf_insn),
1824 static_cast<bpf_insn *>(prog_end->d_buf),
1825 &uctx);
1826
1827 // Clean up transport layer allocations:
1828 for (std::vector<bpf_transport_context *>::iterator it = transport_contexts.begin();
1829 it != transport_contexts.end(); it++)
1830 delete *it;
1831
1832 elf_end(module_elf);
1833 fclose(kmsg);
1834 return 0;
1835 }