/* stapbpf.cxx - SystemTap BPF loader
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright (C) 2016-2019 Red Hat, Inc.
 */
#include <sys/fcntl.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/utsname.h>
#include <sys/resource.h>
#include "bpfinterp.h"
#include <linux/bpf.h>
#include <linux/perf_event.h>

/* Introduced in 4.1. */
#ifndef PERF_EVENT_IOC_SET_BPF
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
#endif

#include "../git_version.h"
#include "../version.h"
#include "../bpf-internal.h"

using namespace std;
#define R_BPF_MAP_FD 1

// perf_event_open(2) has no glibc wrapper; invoke the syscall directly.
static int
perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
                int group_fd, unsigned long flags)
{
  return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
static int group_fd = -1;        // ??? Need one per cpu.
static int verbose = 0;          // set by -v; gates debug diagnostics below
static int target_pid = 0;       // set by -x; value for the '_stp_target' variable
static int warnings = 1;
static int exit_phase = 0;
static int interrupt_message = 0;
static FILE *output_f = stdout;
static FILE *kmsg = NULL;

static const char *module_name;
static const char *module_basename;
static const char *script_name; // name of original systemtap script
static const char *module_license;
static Elf *module_elf;

static uint32_t kernel_version;
// Sized by the contents of the "maps" section.
static bpf_map_def *map_attrs;
static std::vector<int> map_fds;
// PR24543: Some perf constructs must be anchored to a single CPU.
// Normally we use cpu0, but it could (in very rare cases) be disabled.
// Initialized in mark_active_cpus() along with cpu_online.
static int default_cpu = 0;
// Sized by the number of CPUs:
static std::vector<int> perf_fds;
static std::vector<bool> cpu_online; // -- is CPU active?
static std::vector<struct perf_event_mmap_page *> perf_headers;
static std::vector<bpf_transport_context *> transport_contexts;
// Additional info for perf_events transport:
static int perf_event_page_size;
static int perf_event_page_count = 8;
static int perf_event_mmap_size;
// Table of interned strings:
static std::vector<std::string> interned_strings;

// Table of map id's for statistical aggregates:
static std::unordered_map<bpf::globals::agg_idx, bpf::globals::stats_map> aggregates;
// XXX: Required static data and methods from bpf::globals, shared with translator.
#include "../bpf-shared-globals.h"
// Sized by the number of sections, so that we can easily
// look them up by st_shndx.
static std::vector<int> prog_fds;

// Programs to run at begin and end of execution.
static Elf_Data *prog_begin;
static Elf_Data *prog_end;
#define DEBUGFS "/sys/kernel/debug/tracing/"
#define KPROBE_EVENTS DEBUGFS "kprobe_events"
#define UPROBE_EVENTS DEBUGFS "uprobe_events"
#define EVENTS DEBUGFS "events"

#define CPUFS "/sys/devices/system/cpu/"
#define CPUS_ONLINE CPUFS "online"
#define CPUS_POSSIBLE CPUFS "possible"
static void unregister_kprobes(const size_t nprobes);
struct kprobe_data
{
  string args;
  char type;
  int prog_fd;
  int event_id;
  int event_fd; // ??? Need one per cpu.

  kprobe_data(char t, string s, int fd)
    : args(s), type(t), prog_fd(fd), event_id(-1), event_fd(-1)
  { }
};
struct uprobe_data
{
  string path;
  char type;
  int pid;
  unsigned long long offset;
  int prog_fd;
  int event_id;
  int event_fd;

  uprobe_data(string path, char t, int pid, unsigned long long off, int fd)
    : path(path), type(t), pid(pid), offset(off), prog_fd(fd),
      event_id(-1), event_fd(-1)
  { }
};
struct timer_data
{
  unsigned long period;
  int prog_fd;
  int event_fd;

  timer_data(unsigned long period, int fd)
    : period(period), prog_fd(fd), event_fd(-1)
  { }
};
struct perf_data
{
  int event_type;
  int event_config;
  bool has_freq;
  unsigned long interval;
  int prog_fd;
  int event_fd;

  perf_data(int type, int config, bool freq, unsigned long interval, int fd)
    : event_type(type), event_config(config), has_freq(freq),
      interval(interval), prog_fd(fd), event_fd(-1)
  { }
};
struct trace_data
{
  string system;
  string name;
  int prog_fd;
  int event_id;
  int event_fd;

  trace_data(char *s, char *n, int fd)
    : system(s), name(n), prog_fd(fd), event_id(-1), event_fd(-1)
  { }
};
static std::vector<kprobe_data> kprobes;
static std::vector<timer_data> timers;
static std::vector<perf_data> perf_probes;
static std::vector<trace_data> tracepoint_probes;
static std::vector<uprobe_data> uprobes;
// TODO: Move fatal() to bpfinterp.h and replace abort() calls in the interpreter.
// TODO: Add warn() option.
static void __attribute__((noreturn))
fatal(const char *str, ...)
{
  if (module_name)
    fprintf(stderr, "Error loading %s: ", module_name);

  va_list va;
  va_start(va, str);
  vfprintf(stderr, str, va);
  va_end(va);

  exit(1);
}

static void __attribute__((noreturn))
fatal_sys()
{
  fatal("%s\n", strerror(errno));
}

static void __attribute__((noreturn))
fatal_elf()
{
  fatal("%s\n", elf_errmsg(-1));
}
// XXX: based on get_online_cpus()/read_cpu_range()
// in bcc src/cc/common.cc
//
// PR24543: Also sets default_cpu.
//
// This is the only way I know of so far, so I have to imitate it for
// now. Parsing a /sys/devices diagnostic file seems a bit brittle to
// me, but oh well.
static void
mark_active_cpus(unsigned ncpus)
{
  std::ifstream cpu_ranges(CPUS_ONLINE);
  std::string cpu_range;

  // XXX if cpu0 is offline
  int alternate_cpu = -1;
  bool found_alternate = false;

  for (unsigned i = 0; i < ncpus; i++)
    cpu_online.push_back(false);

  while (std::getline(cpu_ranges, cpu_range, ','))
    {
      size_t rangepos = cpu_range.find("-");
      int start, end;
      if (rangepos == std::string::npos)
        start = end = std::stoi(cpu_range);
      else
        {
          start = std::stoi(cpu_range.substr(0, rangepos));
          end = std::stoi(cpu_range.substr(rangepos+1));
        }
      for (int i = start; i <= end; i++)
        {
          if (!found_alternate)
            {
              alternate_cpu = i;
              found_alternate = true;
            }
          cpu_online[i] = true;
        }
    }

  // PR24543: Make sure default_cpu is active.
  if (!cpu_online[default_cpu] && found_alternate)
    default_cpu = alternate_cpu;
}
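// For reference, the CPUS_ONLINE file typically reads something like "0-3"
// or "0,2-5,7" (illustrative values): each comma-separated token parsed
// above is either a single CPU number or an inclusive "start-end" range.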
// Returns the number of currently active CPUs:
static unsigned
count_active_cpus()
{
  unsigned count = 0;
  for (unsigned cpu = 0; cpu < cpu_online.size(); cpu++)
    if (cpu_online[cpu])
      count++;
  return count;
}
static int
create_group_fds()
{
  perf_event_attr peattr;

  memset(&peattr, 0, sizeof(peattr));
  peattr.size = sizeof(peattr);
  peattr.type = PERF_TYPE_SOFTWARE;
  peattr.config = PERF_COUNT_SW_DUMMY;

  return group_fd = perf_event_open(&peattr, -1, default_cpu, -1, 0);
}
static void
instantiate_maps (Elf64_Shdr *shdr, Elf_Data *data)
{
  if (shdr->sh_entsize != sizeof(bpf_map_def))
    fatal("map entry size mismatch (%zu != %zu)\n",
          (size_t)shdr->sh_entsize, sizeof(bpf_map_def));

  size_t i, n = shdr->sh_size / sizeof(bpf_map_def);
  struct bpf_map_def *attrs = static_cast<bpf_map_def *>(data->d_buf);

  map_attrs = attrs;
  map_fds.assign(n, -1);

  // XXX: PR24324 -- This overhead space calculation was too
  // conservative and caused resource exhaustion errors, disabling it
  // until we figure out how much space we need or if the
  // RLIM_INFINITY solution below is adequate.

  /* First, make room for the maps in this process' RLIMIT_MEMLOCK: */
  size_t rlimit_increase = 0;
  for (i = 0; i < n; ++i)
    {
      // TODO: The 58 bytes of overhead space per entry has been
      // decided by trial and error, and may require further tweaking:
      rlimit_increase += (58 + attrs[i].key_size + attrs[i].value_size) * attrs[i].max_entries;
      // TODO: Note that Certain Other Tools just give up on
      // calculating and set rlimit to the maximum possible.
    }

  struct rlimit curr_rlimit;
  int rc;

  rc = getrlimit(RLIMIT_MEMLOCK, &curr_rlimit);
  if (rc < 0)
    fatal("could not get map resource limit: %s\n",
          strerror(errno));

  rlim_t rlim_orig = curr_rlimit.rlim_cur;
  rlim_t rlim_max_orig = curr_rlimit.rlim_max;

  curr_rlimit.rlim_cur += rlimit_increase;
  curr_rlimit.rlim_max += rlimit_increase;
  if (curr_rlimit.rlim_cur < rlim_orig) // handle overflow
    curr_rlimit.rlim_cur = rlim_orig;
  if (curr_rlimit.rlim_max < rlim_max_orig) // handle overflow
    curr_rlimit.rlim_max = rlim_max_orig;

  // TODOXXX: PR24324 -- EXPERIMENTAL fix for aggressive resource limits.
  // Other Tools do something like this but it doesn't solve all our problems.
  curr_rlimit.rlim_cur = RLIM_INFINITY;
  curr_rlimit.rlim_max = RLIM_INFINITY;

  rc = setrlimit(RLIMIT_MEMLOCK, &curr_rlimit);
  if (rc < 0)
    fatal("could not increase map resource limit -- "
          "cur from %lu to %lu, max from %lu to %lu: %s\n",
          rlim_orig, curr_rlimit.rlim_cur,
          rlim_max_orig, curr_rlimit.rlim_max,
          strerror(errno));
  if (verbose > 1)
    {
      fprintf(stderr, "increasing map cur resource limit from %lu to %lu\n",
              rlim_orig, curr_rlimit.rlim_cur);
      fprintf(stderr, "increasing map max resource limit from %lu to %lu\n",
              rlim_max_orig, curr_rlimit.rlim_max);
    }

  /* Now create the maps: */
  for (i = 0; i < n; ++i)
    {
      /* PR22330: The perf_event_map used for message transport must
         have max_entries equal to the number of active CPUs, which we
         wouldn't know for sure at translate time. Set it now: */
      bpf_map_type map_type = static_cast<bpf_map_type>(attrs[i].type);
      if (map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY)
        {
          /* XXX: Assume our only perf_event_map is the percpu transport one: */
          assert(i == bpf::globals::perf_event_map_idx);
          assert(attrs[i].max_entries == bpf::globals::NUM_CPUS_PLACEHOLDER);

          // TODO: perf_event buffers can only be created for currently
          // active CPUs. For now we imitate Certain Other Tools and
          // create perf_events for CPUs that are active at startup time
          // (while sizing the perf_event_map according to total CPUs).
          // But for full coverage, we really need to listen to CPUs
          // coming on/offline and adjust accordingly.
          long ncpus_ = sysconf(_SC_NPROCESSORS_CONF);
          unsigned ncpus = ncpus_ > 0 ? ncpus_ : 1;
          if (ncpus_ < 0)
            fprintf(stderr, "WARNING: could not get number of CPUs, falling back to 1: %s\n", strerror(errno));
          else if (ncpus_ == 0)
            fprintf(stderr, "WARNING: could not get number of CPUs, falling back to 1\n"); // XXX no errno
          //unsigned ncpus = get_nprocs_conf();
          mark_active_cpus((unsigned)ncpus);
          attrs[i].max_entries = ncpus;
        }

      if (verbose > 1)
        fprintf(stderr, "creating map type %u entry %zu: key_size %u, value_size %u, "
                "max_entries %u, map_flags %u\n", map_type, i,
                attrs[i].key_size, attrs[i].value_size,
                attrs[i].max_entries, attrs[i].map_flags);

      int fd = bpf_create_map(static_cast<bpf_map_type>(attrs[i].type),
                              attrs[i].key_size, attrs[i].value_size,
                              attrs[i].max_entries, attrs[i].map_flags);
      if (fd < 0)
        fatal("map entry %zu: %s\n", i, strerror(errno));
      map_fds[i] = fd;
    }
}
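// For reference, each entry in the "maps" section is one bpf_map_def record
// (layout shared with the translator); the loader only depends on the fields
// read above: type, key_size, value_size, max_entries and map_flags.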
static int
prog_load(Elf_Data *data, const char *name)
{
  enum bpf_prog_type prog_type;

  if (strncmp(name, "kprobe", 6) == 0)
    prog_type = BPF_PROG_TYPE_KPROBE;
  else if (strncmp(name, "kretprobe", 9) == 0)
    prog_type = BPF_PROG_TYPE_KPROBE;
  else if (strncmp(name, "uprobe", 6) == 0)
    prog_type = BPF_PROG_TYPE_KPROBE;
  else if (strncmp(name, "timer", 5) == 0)
    prog_type = BPF_PROG_TYPE_PERF_EVENT;
  else if (strncmp(name, "trace", 5) == 0)
    prog_type = BPF_PROG_TYPE_TRACEPOINT;
  else if (strncmp(name, "perf", 4) == 0)
    {
      // "perf/2/..." encodes event type 2, i.e. PERF_TYPE_TRACEPOINT.
      if (name[5] == '2' && name[6] == '/')
        prog_type = BPF_PROG_TYPE_TRACEPOINT;
      else
        prog_type = BPF_PROG_TYPE_PERF_EVENT;
    }
  else
    fatal("unhandled program type for section \"%s\"\n", name);

  if (data->d_size % sizeof(bpf_insn))
    fatal("program size not a multiple of %zu\n", sizeof(bpf_insn));

  if (kmsg != NULL)
    {
      fprintf (kmsg, "%s (%s): stapbpf: %s, name: %s, d_size: %lu\n",
               module_basename, script_name, VERSION, name, (unsigned long)data->d_size);
      fflush (kmsg); // Otherwise, flush will only happen after the prog runs.
    }

  int fd = bpf_prog_load(prog_type, static_cast<bpf_insn *>(data->d_buf),
                         data->d_size, module_license, kernel_version);
  if (fd < 0)
    {
      if (bpf_log_buf[0] != 0)
        fatal("bpf program load failed: %s\n%s\n",
              strerror(errno), bpf_log_buf);
      else
        fatal("bpf program load failed: %s\n", strerror(errno));
    }
  return fd;
}
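// Illustrative section names as dispatched above: "kprobe/<hex-offset>",
// "kretprobe/<symbol>", "uprobe/<type>/<pid>/<offset><path>",
// "timer/jiff/<N>", "trace/<system>/<tracepoint>" and
// "perf/<type>/<config>/<f|p>/<interval>" (see the collect_* parsers below).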
static void
prog_relocate(Elf_Data *prog_data, Elf_Data *rel_data,
              Elf_Data *sym_data, Elf_Data *str_data,
              const char *prog_name, unsigned maps_idx, bool allocated)
{
  bpf_insn *insns = static_cast<bpf_insn *>(prog_data->d_buf);
  Elf64_Rel *rels = static_cast<Elf64_Rel *>(rel_data->d_buf);
  Elf64_Sym *syms = static_cast<Elf64_Sym *>(sym_data->d_buf);

  if (prog_data->d_size % sizeof(bpf_insn))
    fatal("program size not a multiple of %zu\n", sizeof(bpf_insn));
  if (rel_data->d_type != ELF_T_REL
      || rel_data->d_size % sizeof(Elf64_Rel))
    fatal("invalid reloc metadata\n");
  if (sym_data->d_type != ELF_T_SYM
      || sym_data->d_size % sizeof(Elf64_Sym))
    fatal("invalid symbol metadata\n");

  size_t psize = prog_data->d_size;
  size_t nrels = rel_data->d_size / sizeof(Elf64_Rel);
  size_t nsyms = sym_data->d_size / sizeof(Elf64_Sym);

  for (size_t i = 0; i < nrels; ++i)
    {
      uint32_t sym = ELF64_R_SYM(rels[i].r_info);
      uint32_t type = ELF64_R_TYPE(rels[i].r_info);
      unsigned long long r_ofs = rels[i].r_offset;
      size_t fd_idx;

      if (type != R_BPF_MAP_FD)
        fatal("invalid relocation type %u\n", type);
      if (sym >= nsyms)
        fatal("invalid symbol index %u\n", sym);
      if (r_ofs >= psize || r_ofs % sizeof(bpf_insn))
        fatal("invalid relocation offset at %s+%llu\n", prog_name, r_ofs);
      if (syms[sym].st_name >= str_data->d_size)
        fatal("invalid relocation symbol %u\n", sym);
      if (syms[sym].st_shndx != maps_idx
          || syms[sym].st_value % sizeof(bpf_map_def)
          || (fd_idx = syms[sym].st_value / sizeof(bpf_map_def),
              fd_idx >= map_fds.size()))
        {
          const char *name = "";
          if (syms[sym].st_name < str_data->d_size)
            name = static_cast<char *>(str_data->d_buf) + syms[sym].st_name;
          if (*name)
            fatal("symbol %s does not reference a map\n", name);
          else
            fatal("symbol %u does not reference a map\n", sym);
        }

      bpf_insn *insn = insns + (r_ofs / sizeof(bpf_insn));
      if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
        fatal("invalid relocation insn at %s+%llu\n", prog_name, r_ofs);

      insn->src_reg = BPF_PSEUDO_MAP_FD;
      insn->imm = (allocated ? map_fds[fd_idx] : fd_idx);
    }
}
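// Background on the patching above: a 64-bit immediate load
// (BPF_LD | BPF_IMM | BPF_DW) occupies two instruction slots, and setting
// src_reg to BPF_PSEUDO_MAP_FD tells the kernel verifier to interpret the
// immediate operand as a map file descriptor, which it rewrites into a map
// pointer at verification time.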
static void
maybe_collect_kprobe(const char *name, unsigned name_idx,
                     unsigned fd_idx, Elf64_Addr offset)
{
  char type;
  string arg;

  if (strncmp(name, "kprobe/", 7) == 0)
    {
      string line;
      const char *stext = NULL;
      type = 'p';
      name += 7;

      ifstream syms("/proc/kallsyms");
      if (!syms)
        fatal("error opening /proc/kallsyms: %s\n", strerror(errno));

      // get value of symbol _stext and add it to the offset found in name.
      while (getline(syms, line))
        {
          const char *l = line.c_str();
          if (strncmp(l + 19, "_stext", 6) == 0)
            {
              stext = l;
              break;
            }
        }
      if (stext == NULL)
        fatal("could not find _stext in /proc/kallsyms");

      unsigned long addr = strtoul(stext, NULL, 16);
      addr += strtoul(name, NULL, 16);
      stringstream ss;
      ss << "0x" << hex << addr;
      arg = ss.str();
    }
  else if (strncmp(name, "kretprobe/", 10) == 0)
    type = 'r', arg = name + 10;
  else
    return;

  int fd = -1;
  if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
    fatal("probe %u section %u not loaded\n", name_idx, fd_idx);
  if (offset != 0)
    fatal("probe %u offset non-zero\n", name_idx);

  kprobes.push_back(kprobe_data(type, arg, fd));
}
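// A /proc/kallsyms line looks like "ffffffff81000000 T _stext" (address
// illustrative): a 16-digit hex address, a space, a one-character type code
// and another space put the symbol name at column 19, which is what the
// strncmp(l + 19, ...) check above relies on.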
static void
collect_uprobe(const char *name, unsigned name_idx, unsigned fd_idx)
{
  char type = '\0';
  int pid = 0;
  unsigned long long off = 0;
  char path[PATH_MAX];

  int res = sscanf(name, "uprobe/%c/%d/%llu%s", &type, &pid, &off, path);

  if (pid == 0)
    pid = -1; // indicates to perf_event_open that we're tracing all processes

  if (res != 4)
    fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);

  int fd = -1;
  if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
    fatal("probe %u section %u not loaded\n", name_idx, fd_idx);

  uprobes.push_back(uprobe_data(std::string(path), type, pid, off, fd));
}
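// Illustrative section name: "uprobe/p/1234/2048/usr/bin/foo" requests a
// 'p'-type probe on pid 1234 at offset 2048 in /usr/bin/foo; note that the
// trailing %s conversion slurps the path's leading '/' back in.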
static void
collect_perf(const char *name, unsigned name_idx, unsigned fd_idx)
{
  char has_freq;
  int event_type;
  int event_config;
  unsigned long interval;

  int res = sscanf(name, "perf/%d/%d/%c/%lu",
                   &event_type, &event_config, &has_freq, &interval);
  if (res != 4)
    fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);

  int fd = -1;
  if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
    fatal("probe %u section %u not loaded\n", name_idx, fd_idx);

  perf_probes.push_back(
    perf_data(event_type, event_config, has_freq == 'f', interval, fd));
}
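// Illustrative section name: "perf/0/0/f/1000" requests event type 0
// (PERF_TYPE_HARDWARE), config 0, sampled by frequency ('f') at interval
// 1000; any other character in the third field selects sampling by period.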
static void
collect_timer(const char *name, unsigned name_idx, unsigned fd_idx)
{
  // The period value starts at offset 11, e.g. past "timer/jiff/".
  unsigned long period = strtoul(name + 11, NULL, 10);

  if (strncmp(name + 6, "jiff/", 5) == 0)
    {
      long jiffies_per_sec = sysconf(_SC_CLK_TCK);
      period *= 1e9 / jiffies_per_sec;
    }

  int fd = -1;
  if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
    fatal("probe %u section %u not loaded\n", name_idx, fd_idx);

  timers.push_back(timer_data(period, fd));
}
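// Unit conversion above: jiffy periods are converted to nanoseconds using
// the clock tick rate. For example, with _SC_CLK_TCK == 100 one jiffy is
// 1e9 / 100 = 10,000,000 ns, so "timer/jiff/5" yields a 50 ms period.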
static void
collect_tracepoint(const char *name, unsigned name_idx, unsigned fd_idx)
{
  char tp_system[512];
  char tp_name[512];

  int res = sscanf(name, "trace/%[^/]/%s", tp_system, tp_name);
  if (res != 2 || strlen(name) > 512)
    fatal("unable to parse name of probe %u section %u\n", name_idx, fd_idx);

  int fd = -1;
  if (fd_idx >= prog_fds.size() || (fd = prog_fds[fd_idx]) < 0)
    fatal("probe %u section %u not loaded\n", name_idx, fd_idx);

  tracepoint_probes.push_back(trace_data(tp_system, tp_name, fd));
}
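// Illustrative section name: "trace/sched/sched_switch" names the
// sched_switch tracepoint in subsystem sched; its perf event id is read
// later from DEBUGFS "events/sched/sched_switch/id".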
static void
kprobe_collect_from_syms(Elf_Data *sym_data, Elf_Data *str_data)
{
  Elf64_Sym *syms = static_cast<Elf64_Sym *>(sym_data->d_buf);
  size_t nsyms = sym_data->d_size / sizeof(Elf64_Sym);

  if (sym_data->d_type != ELF_T_SYM
      || sym_data->d_size % sizeof(Elf64_Sym))
    fatal("invalid kprobes symbol metadata\n");

  for (size_t i = 0; i < nsyms; ++i)
    {
      const char *name;
      if (syms[i].st_name < str_data->d_size)
        name = static_cast<char *>(str_data->d_buf) + syms[i].st_name;
      else
        fatal("symbol %zu has invalid string index\n", i);
      maybe_collect_kprobe(name, i, syms[i].st_shndx, syms[i].st_value);
    }
}
static void
unregister_uprobes(const size_t nprobes)
{
  if (nprobes == 0)
    return;

  int fd = open(DEBUGFS "uprobe_events", O_WRONLY);
  if (fd < 0)
    return;

  const int pid = getpid();
  for (size_t i = 0; i < nprobes; ++i)
    {
      close(uprobes[i].event_fd);

      char msgbuf[PATH_MAX];
      ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "-:stapprobe_%d_%zu",
                              pid, i);
      ssize_t wlen = write(fd, msgbuf, olen);
      if (wlen < 0)
        fprintf(stderr, "Error removing probe %zu: %s\n",
                i, strerror(errno));
    }
  close(fd);
}
static void
register_uprobes()
{
  size_t nprobes = uprobes.size();
  if (nprobes == 0)
    return;

  int fd = open(UPROBE_EVENTS, O_WRONLY);
  if (fd < 0)
    fatal("Error opening %s: %s\n", UPROBE_EVENTS, strerror(errno));

  const int pid = getpid();

  for (size_t i = 0; i < nprobes; ++i)
    {
      uprobe_data &u = uprobes[i];
      char msgbuf[PATH_MAX];

      ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "%c:stapprobe_%d_%zu %s:0x%llx",
                              u.type, pid, i, u.path.c_str(), u.offset);
      if ((size_t)olen >= sizeof(msgbuf))
        {
          fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
          goto fail;
        }

      if (verbose)
        fprintf(stderr, "Associating probe %zu with uprobe %s\n", i, msgbuf);

      ssize_t wlen = write(fd, msgbuf, olen);
      if (wlen != olen)
        {
          fprintf(stderr, "Error creating probe %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }
    }
  close(fd);

  for (size_t i = 0; i < nprobes; ++i)
    {
      char fnbuf[PATH_MAX];
      ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
                             DEBUGFS "events/uprobes/stapprobe_%d_%zu/id", pid, i);
      if ((size_t)len >= sizeof(fnbuf))
        {
          fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
          goto fail;
        }

      fd = open(fnbuf, O_RDONLY);
      if (fd < 0)
        {
          fprintf(stderr, "Error opening probe event id %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }

      char msgbuf[128];
      len = read(fd, msgbuf, sizeof(msgbuf) - 1);
      close(fd);
      if (len < 0)
        {
          fprintf(stderr, "Error reading probe event id %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }
      msgbuf[len] = '\0';

      uprobes[i].event_id = atoi(msgbuf);
    }

  // ??? Iterate to enable on all cpus, each with a different group_fd.
  {
    perf_event_attr peattr;

    memset(&peattr, 0, sizeof(peattr));
    peattr.size = sizeof(peattr);
    peattr.type = PERF_TYPE_TRACEPOINT;
    peattr.sample_type = PERF_SAMPLE_RAW;
    peattr.sample_period = 1;
    peattr.wakeup_events = 1;

    for (size_t i = 0; i < nprobes; ++i)
      {
        uprobe_data &u = uprobes[i];
        peattr.config = u.event_id;

        fd = perf_event_open(&peattr, u.pid, default_cpu, -1, 0);
        if (fd < 0)
          {
            fprintf(stderr, "Error opening probe id %zu: %s\n",
                    i, strerror(errno));
            goto fail;
          }
        u.event_fd = fd;

        if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, u.prog_fd) < 0)
          {
            fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
                    i, strerror(errno));
            goto fail;
          }
      }
  }
  return;

 fail:
  unregister_uprobes(nprobes);
  exit(1);
}
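// The strings written to uprobe_events above follow the kernel uprobetracer
// format, e.g. "p:stapprobe_1234_0 /usr/bin/foo:0x800" to create an entry
// and "-:stapprobe_1234_0" to remove it (pid and offset values illustrative).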
static void
register_kprobes()
{
  size_t nprobes = kprobes.size();
  if (nprobes == 0)
    return;

  int fd = open(KPROBE_EVENTS, O_WRONLY);
  if (fd < 0)
    fatal("Error opening %s: %s\n", KPROBE_EVENTS, strerror(errno));

  const int pid = getpid();

  for (size_t i = 0; i < nprobes; ++i)
    {
      kprobe_data &k = kprobes[i];
      char msgbuf[PATH_MAX];

      ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "%c:p%d_%zu %s",
                              k.type, pid, i, k.args.c_str());
      if ((size_t)olen >= sizeof(msgbuf))
        {
          fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
          goto fail;
        }

      if (verbose)
        fprintf(stderr, "Associating probe %zu with kprobe %s\n", i, msgbuf);

      ssize_t wlen = write(fd, msgbuf, olen);
      if (wlen != olen)
        {
          fprintf(stderr, "Error creating probe %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }
    }
  close(fd);

  for (size_t i = 0; i < nprobes; ++i)
    {
      char fnbuf[PATH_MAX];
      ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
                             DEBUGFS "events/kprobes/p%d_%zu/id", pid, i);
      if ((size_t)len >= sizeof(fnbuf))
        {
          fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
          goto fail;
        }

      fd = open(fnbuf, O_RDONLY);
      if (fd < 0)
        {
          fprintf(stderr, "Error opening probe event id %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }

      char msgbuf[128];
      len = read(fd, msgbuf, sizeof(msgbuf) - 1);
      close(fd);
      if (len < 0)
        {
          fprintf(stderr, "Error reading probe event id %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }
      msgbuf[len] = '\0';

      kprobes[i].event_id = atoi(msgbuf);
    }

  // ??? Iterate to enable on all cpus, each with a different group_fd.
  {
    perf_event_attr peattr;

    memset(&peattr, 0, sizeof(peattr));
    peattr.size = sizeof(peattr);
    peattr.type = PERF_TYPE_TRACEPOINT;
    peattr.sample_type = PERF_SAMPLE_RAW;
    peattr.sample_period = 1;
    peattr.wakeup_events = 1;

    for (size_t i = 0; i < nprobes; ++i)
      {
        kprobe_data &k = kprobes[i];
        peattr.config = k.event_id;

        fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
        if (fd < 0)
          {
            fprintf(stderr, "Error opening probe id %zu: %s\n",
                    i, strerror(errno));
            goto fail;
          }
        k.event_fd = fd;

        if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, k.prog_fd) < 0)
          {
            fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
                    i, strerror(errno));
            goto fail;
          }
      }
  }
  return;

 fail:
  unregister_kprobes(nprobes);
  exit(1);
}
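// Kprobe registration mirrors the uprobe path: lines like
// "p:p1234_0 0xffffffff81234567" (or "r:..." for kretprobes, values
// illustrative) go into kprobe_events, the tracefs id file supplies
// perf_event_attr.config, and PERF_EVENT_IOC_SET_BPF attaches the program.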
static void
unregister_kprobes(const size_t nprobes)
{
  if (nprobes == 0)
    return;

  int fd = open(DEBUGFS "kprobe_events", O_WRONLY);
  if (fd < 0)
    return;

  const int pid = getpid();
  for (size_t i = 0; i < nprobes; ++i)
    {
      close(kprobes[i].event_fd);

      char msgbuf[PATH_MAX];
      ssize_t olen = snprintf(msgbuf, sizeof(msgbuf), "-:p%d_%zu",
                              pid, i);
      ssize_t wlen = write(fd, msgbuf, olen);
      if (wlen < 0)
        fprintf(stderr, "Error removing probe %zu: %s\n",
                i, strerror(errno));
    }
  close(fd);
}
static void
unregister_tracepoints(const size_t nprobes)
{
  for (size_t i = 0; i < nprobes; ++i)
    close(tracepoint_probes[i].event_fd);
}
static void
register_tracepoints()
{
  size_t nprobes = tracepoint_probes.size();
  if (nprobes == 0)
    return;

  for (size_t i = 0; i < nprobes; ++i)
    {
      trace_data &t = tracepoint_probes[i];
      char fnbuf[PATH_MAX];
      ssize_t len = snprintf(fnbuf, sizeof(fnbuf),
                             DEBUGFS "events/%s/%s/id",
                             t.system.c_str(), t.name.c_str());
      if ((size_t)len >= sizeof(fnbuf))
        {
          fprintf(stderr, "Buffer overflow creating probe %zu\n", i);
          goto fail;
        }

      int fd = open(fnbuf, O_RDONLY);
      if (fd < 0)
        {
          fprintf(stderr, "Error opening probe event id %zu: %s\n",
                  i, strerror(errno));
          if (errno == ENOENT)
            fprintf(stderr, "\"%s/%s\" could not be found in %s\n",
                    t.system.c_str(), t.name.c_str(), EVENTS);
          goto fail;
        }

      char msgbuf[128];
      len = read(fd, msgbuf, sizeof(msgbuf) - 1);
      close(fd);
      if (len < 0)
        {
          fprintf(stderr, "Error reading probe event id %zu: %s\n",
                  i, strerror(errno));
          goto fail;
        }
      msgbuf[len] = '\0';

      t.event_id = atoi(msgbuf);
    }

  // ??? Iterate to enable on all cpus, each with a different group_fd.
  {
    perf_event_attr peattr;

    memset(&peattr, 0, sizeof(peattr));
    peattr.size = sizeof(peattr);
    peattr.type = PERF_TYPE_TRACEPOINT;
    peattr.sample_type = PERF_SAMPLE_RAW;
    peattr.sample_period = 1;
    peattr.wakeup_events = 1;

    for (size_t i = 0; i < nprobes; ++i)
      {
        trace_data &t = tracepoint_probes[i];
        peattr.config = t.event_id;

        int fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
        if (fd < 0)
          {
            fprintf(stderr, "Error opening probe id %zu: %s\n",
                    i, strerror(errno));
            goto fail;
          }
        t.event_fd = fd;

        if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, t.prog_fd) < 0)
          {
            fprintf(stderr, "Error installing bpf for probe id %zu: %s\n",
                    i, strerror(errno));
            goto fail;
          }
      }
  }
  return;

 fail:
  unregister_tracepoints(nprobes);
  exit(1);
}
static void
unregister_timers(const size_t nprobes)
{
  for (size_t i = 0; i < nprobes; ++i)
    close(timers[i].event_fd);
}
static void
register_timers()
{
  perf_event_attr peattr;

  memset(&peattr, 0, sizeof(peattr));
  peattr.size = sizeof(peattr);
  peattr.type = PERF_TYPE_SOFTWARE;
  peattr.config = PERF_COUNT_SW_CPU_CLOCK;

  for (size_t i = 0; i < timers.size(); ++i)
    {
      timer_data &t = timers[i];
      peattr.sample_period = t.period;

      int fd = perf_event_open(&peattr, -1, default_cpu, group_fd, 0);
      if (fd < 0)
        {
          int err = errno;
          unregister_timers(timers.size());
          fatal("Error opening timer probe id %zu: %s\n", i + 1, strerror(err));
        }
      t.event_fd = fd;

      if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, t.prog_fd) < 0)
        {
          int err = errno;
          unregister_timers(timers.size());
          fatal("Error installing bpf for timer probe id %zu: %s\n",
                i + 1, strerror(err));
        }
    }
}
static void
unregister_perf(const size_t nprobes)
{
  for (size_t i = 0; i < nprobes; ++i)
    close(perf_probes[i].event_fd);
}
static void
register_perf()
{
  for (size_t i = 0; i < perf_probes.size(); ++i)
    {
      perf_data &p = perf_probes[i];
      perf_event_attr peattr;

      memset(&peattr, 0, sizeof(peattr));
      peattr.size = sizeof(peattr);
      peattr.type = p.event_type;
      peattr.config = p.event_config;

      if (p.has_freq)
        {
          peattr.freq = 1;
          peattr.sample_freq = p.interval;
        }
      else
        peattr.sample_period = p.interval;

      // group_fd is not used since this event might have an
      // incompatible type/config.
      int fd = perf_event_open(&peattr, -1, default_cpu, -1, 0);
      if (fd < 0)
        {
          int err = errno;
          unregister_perf(perf_probes.size());
          fatal("Error opening perf probe id %zu: %s\n", i + 1, strerror(err));
        }
      p.event_fd = fd;

      if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, p.prog_fd) < 0)
        {
          int err = errno;
          unregister_perf(perf_probes.size());
          fatal("Error installing bpf for perf probe id %zu: %s\n",
                i + 1, strerror(err));
        }
    }
}
static void
init_internal_globals()
{
  using namespace bpf;

  int key = globals::EXIT;
  long val = 0;

  if (bpf_update_elem(map_fds[globals::internal_map_idx],
                      (void*)&key, (void*)&val, BPF_ANY) != 0)
    fatal("Error updating pid: %s\n", strerror(errno));
}
// PR22330: Initialize perf_event_map and perf_fds.
static void
init_perf_transport()
{
  using namespace bpf;

  unsigned ncpus = map_attrs[globals::perf_event_map_idx].max_entries;

  for (unsigned cpu = 0; cpu < ncpus; cpu++)
    {
      if (!cpu_online[cpu]) // -- skip inactive CPUs.
        {
          perf_fds.push_back(-1);
          transport_contexts.push_back(nullptr);
          continue;
        }

      struct perf_event_attr peattr;

      memset(&peattr, 0, sizeof(peattr));
      peattr.size = sizeof(peattr);
      peattr.sample_type = PERF_SAMPLE_RAW;
      peattr.type = PERF_TYPE_SOFTWARE;
      peattr.config = PERF_COUNT_SW_BPF_OUTPUT;
      peattr.sample_period = 1;
      peattr.wakeup_events = 1;

      int pmu_fd = perf_event_open(&peattr, -1/*pid*/, cpu, -1/*group_fd*/, 0);
      if (pmu_fd < 0)
        fatal("Error initializing perf event for cpu %d: %s\n", cpu, strerror(errno));
      if (bpf_update_elem(map_fds[globals::perf_event_map_idx],
                          (void*)&cpu, (void*)&pmu_fd, BPF_ANY) != 0)
        fatal("Error assigning perf event for cpu %d: %s\n", cpu, strerror(errno));
      ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
      perf_fds.push_back(pmu_fd);

      // Create a data structure to track what's happening on each CPU:
      bpf_transport_context *ctx
        = new bpf_transport_context(cpu, pmu_fd, ncpus, map_attrs, &map_fds,
                                    output_f, &interned_strings, &aggregates);
      transport_contexts.push_back(ctx);
    }

  // XXX: based on perf_event_mmap_header()
  // in kernel tools/testing/selftests/bpf/trace_helpers.c
  perf_event_page_size = getpagesize();
  perf_event_mmap_size = perf_event_page_size * (perf_event_page_count + 1);
  for (unsigned cpu = 0; cpu < ncpus; cpu++)
    {
      if (!cpu_online[cpu]) // -- skip inactive CPUs.
        {
          perf_headers.push_back(nullptr);
          continue;
        }

      int pmu_fd = perf_fds[cpu];
      void *base = mmap(NULL, perf_event_mmap_size,
                        PROT_READ | PROT_WRITE, MAP_SHARED,
                        pmu_fd, 0);
      if (base == MAP_FAILED)
        fatal("error mmapping header for perf_event fd %d\n", pmu_fd);
      perf_headers.push_back((perf_event_mmap_page *)base);
      if (verbose)
        fprintf(stderr, "Initialized perf_event output on cpu %d\n", cpu);
    }
}
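// Ring buffer layout note: each per-cpu mmap spans perf_event_page_count + 1
// pages because the first page holds the perf_event_mmap_page control header
// and the remaining pages form the data ring that the bpf programs fill via
// their PERF_COUNT_SW_BPF_OUTPUT events.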
static void
load_bpf_file(const char *module)
{
  module_name = module;

  /* Extract basename: */
  char *buf = (char *)malloc(BPF_MAXSTRINGLEN * sizeof(char));
  string module_name_str(module);
  string module_basename_str
    = module_name_str.substr(module_name_str.rfind('/')+1); // basename
  size_t len = module_basename_str.copy(buf, BPF_MAXSTRINGLEN-1);
  buf[len] = '\0';
  module_basename = buf;

  int fd = open(module, O_RDONLY);
  if (fd < 0)
    fatal_sys();

  elf_version(EV_CURRENT);

  Elf *elf = elf_begin(fd, ELF_C_READ_MMAP_PRIVATE, NULL);
  if (elf == NULL)
    fatal_elf();
  module_elf = elf;

  Elf64_Ehdr *ehdr = elf64_getehdr(elf);
  if (ehdr == NULL)
    fatal_elf();

  // Byte order should match the host, since we're loading locally.
  {
    const char *end_str;
    switch (ehdr->e_ident[EI_DATA])
      {
      case ELFDATA2MSB:
        if (__BYTE_ORDER == __BIG_ENDIAN)
          break;
        end_str = "big-endian";
        goto bad_byte_order;
      case ELFDATA2LSB:
        if (__BYTE_ORDER == __LITTLE_ENDIAN)
          break;
        end_str = "little-endian";
        goto bad_byte_order;
      default:
        end_str = "unknown";
      bad_byte_order:
        fatal("incorrect byte ordering: %s\n", end_str);
      }
  }
  // Tiny bit of sanity checking on the rest of the header. Since LLVM
  // began by producing files with EM_NONE, accept that too.
  if (ehdr->e_machine != EM_NONE && ehdr->e_machine != EM_BPF)
    fatal("incorrect machine type: %d\n", ehdr->e_machine);

  unsigned shnum = ehdr->e_shnum;
  prog_fds.assign(shnum, -1);

  std::vector<Elf64_Shdr *> shdrs(shnum, NULL);
  std::vector<Elf_Data *> sh_data(shnum, NULL);
  std::vector<const char *> sh_name(shnum, NULL);
  unsigned maps_idx = 0;
  unsigned version_idx = 0;
  unsigned license_idx = 0;
  unsigned script_name_idx = 0;
  unsigned interned_strings_idx = 0;
  unsigned aggregates_idx = 0;
  unsigned kprobes_idx = 0;
  unsigned begin_idx = 0;
  unsigned end_idx = 0;
  // First pass to identify special sections, and make sure
  // all data is readable.
  for (unsigned i = 1; i < shnum; ++i)
    {
      Elf_Scn *scn = elf_getscn(elf, i);
      if (scn == NULL)
        fatal_elf();

      Elf64_Shdr *shdr = elf64_getshdr(scn);
      if (shdr == NULL)
        fatal_elf();

      const char *shname = elf_strptr(elf, ehdr->e_shstrndx, shdr->sh_name);
      if (shname == NULL)
        fatal_elf();

      // We need not consider any empty sections.
      if (shdr->sh_size == 0 || !*shname)
        continue;

      Elf_Data *data = elf_getdata(scn, NULL);
      if (data == NULL)
        fatal_elf();

      shdrs[i] = shdr;
      sh_data[i] = data;
      sh_name[i] = shname;

      if (strcmp(shname, "license") == 0)
        license_idx = i;
      else if (strcmp(shname, "stapbpf_script_name") == 0)
        script_name_idx = i;
      else if (strcmp(shname, "stapbpf_interned_strings") == 0)
        interned_strings_idx = i;
      else if (strcmp(shname, "stapbpf_aggregates") == 0)
        aggregates_idx = i;
      else if (strcmp(shname, "version") == 0)
        version_idx = i;
      else if (strcmp(shname, "maps") == 0)
        maps_idx = i;
      else if (strcmp(shname, "kprobes") == 0)
        kprobes_idx = i;
      else if (strcmp(shname, "stap_begin") == 0)
        begin_idx = i;
      else if (strcmp(shname, "stap_end") == 0)
        end_idx = i;
    }
  // Two special sections are not optional.
  if (license_idx != 0)
    module_license = static_cast<char *>(sh_data[license_idx]->d_buf);
  else
    fatal("missing license section\n");
  if (script_name_idx != 0)
    script_name = static_cast<char *>(sh_data[script_name_idx]->d_buf);
  else
    script_name = "<unknown>";
  if (version_idx != 0)
    {
      unsigned long long size = shdrs[version_idx]->sh_size;
      if (size != 4)
        fatal("invalid version size (%llu)\n", size);
      memcpy(&kernel_version, sh_data[version_idx]->d_buf, 4);
    }
  else
    fatal("missing version section\n");
  // Create bpf maps as required.
  if (maps_idx != 0)
    instantiate_maps(shdrs[maps_idx], sh_data[maps_idx]);

  // Create interned strings as required.
  if (interned_strings_idx != 0)
    {
      // XXX: Whatever the type used by the translator, this section
      // just holds a blob of NUL-terminated strings we parse as follows:
      char *strtab = static_cast<char *>(sh_data[interned_strings_idx]->d_buf);
      unsigned long long strtab_size = shdrs[interned_strings_idx]->sh_size;
      unsigned long long ofs = 0;
      bool found_hdr = false;
      while (ofs < strtab_size)
        {
          // XXX: Potentially vulnerable to NUL byte in string constant.
          std::string str(strtab+ofs); // XXX: will slurp up to NUL byte
          if (str.size() == 0 && !found_hdr)
            found_hdr = true; // section *may* start with an extra NUL byte
          else
            interned_strings.push_back(str);
          ofs += str.size() + 1;
        }
    }
  // PR23476: Initialize table of statistical aggregates.
  if (aggregates_idx != 0)
    {
      uint64_t *aggtab = static_cast<uint64_t *>(sh_data[aggregates_idx]->d_buf);
      unsigned long long aggtab_size = shdrs[aggregates_idx]->sh_size;
      unsigned ofs = 0; unsigned i = 0;
      while (ofs < aggtab_size)
        {
          bpf::globals::agg_idx agg_id = (bpf::globals::agg_idx)aggtab[i];
          bpf::globals::interned_stats_map ism;
          for (unsigned j = 0; j < bpf::globals::stat_fields.size(); j++)
            ism.push_back(aggtab[i+1+j]);
          aggregates[agg_id] = bpf::globals::deintern_stats_map(ism);
          i += 1 + bpf::globals::stat_fields.size();
          ofs = sizeof(uint64_t) * i;
        }
    }
  // Relocate all programs that require it.
  for (unsigned i = 1; i < shnum; ++i)
    {
      Elf64_Shdr *rel_hdr = shdrs[i];
      if (rel_hdr == NULL || rel_hdr->sh_type != SHT_REL)
        continue;

      unsigned progi = rel_hdr->sh_info;
      if (progi == 0 || progi >= shnum)
        fatal("invalid section info %u->%u\n", i, progi);
      Elf64_Shdr *prog_hdr = shdrs[progi];

      unsigned symi = rel_hdr->sh_link;
      if (symi == 0 || symi >= shnum)
        fatal("invalid section link %u->%u\n", i, symi);
      Elf64_Shdr *sym_hdr = shdrs[symi];

      unsigned stri = sym_hdr->sh_link;
      if (stri == 0 || stri >= shnum)
        fatal("invalid section link %u->%u\n", symi, stri);

      if (prog_hdr != NULL && (prog_hdr->sh_flags & SHF_EXECINSTR))
        prog_relocate(sh_data[progi], sh_data[i], sh_data[symi],
                      sh_data[stri], sh_name[progi], maps_idx,
                      prog_hdr->sh_flags & SHF_ALLOC);
    }
  // Load all programs that require it.
  for (unsigned i = 1; i < shnum; ++i)
    {
      Elf64_Shdr *shdr = shdrs[i];
      if (shdr != NULL
          && (shdr->sh_flags & SHF_ALLOC) && (shdr->sh_flags & SHF_EXECINSTR))
        prog_fds[i] = prog_load(sh_data[i], sh_name[i]);
    }
  // Remember begin and end probes.
  if (begin_idx != 0)
    {
      Elf64_Shdr *shdr = shdrs[begin_idx];
      if (shdr->sh_flags & SHF_EXECINSTR)
        prog_begin = sh_data[begin_idx];
    }
  if (end_idx != 0)
    {
      Elf64_Shdr *shdr = shdrs[end_idx];
      if (shdr->sh_flags & SHF_EXECINSTR)
        prog_end = sh_data[end_idx];
    }
  // Record all kprobes.
  if (kprobes_idx != 0)
    {
      // The Preferred Systemtap Way puts kprobe strings into a symbol
      // table, so that multiple kprobes can reference the same program.

      // ??? We don't really have to have a separate kprobe symbol table;
      // we could pull kprobes out of the main symbol table too. This
      // would probably make it easier for llvm-bpf folks to transition.
      // One would only need to create symbol aliases with custom asm names.

      Elf64_Shdr *sym_hdr = shdrs[kprobes_idx];
      if (sym_hdr->sh_type != SHT_SYMTAB)
        fatal("invalid section type for kprobes section\n");

      unsigned stri = sym_hdr->sh_link;
      if (stri == 0 || stri >= shnum)
        fatal("invalid section link %u->%u\n", kprobes_idx, stri);

      kprobe_collect_from_syms(sh_data[kprobes_idx], sh_data[stri]);
    }
  else
    {
      // The original llvm-bpf way puts kprobe strings into the
      // section name. Each kprobe has its own program.
      for (unsigned i = 1; i < shnum; ++i)
        if (sh_name[i] != NULL)
          maybe_collect_kprobe(sh_name[i], i, i, 0);
    }
  // Record all other probes
  for (unsigned i = 1; i < shnum; ++i) {
    if (sh_name[i] == NULL)
      continue;
    if (strncmp(sh_name[i], "uprobe", 6) == 0)
      collect_uprobe(sh_name[i], i, i);
    if (strncmp(sh_name[i], "trace", 5) == 0)
      collect_tracepoint(sh_name[i], i, i);
    if (strncmp(sh_name[i], "perf", 4) == 0)
      collect_perf(sh_name[i], i, i);
    if (strncmp(sh_name[i], "timer", 5) == 0)
      collect_timer(sh_name[i], i, i);
  }
}
// Check whether a bpf program has set the exit flag in the internal map:
static bool
get_exit_status()
{
  int key = bpf::globals::EXIT;
  long val = 0;

  if (bpf_lookup_elem
      (map_fds[bpf::globals::internal_map_idx], &key, &val) != 0)
    fatal("error during bpf map lookup: %s\n", strerror(errno));
  return val != 0;
}
// XXX: based on perf_event_sample
// in kernel tools/testing/selftests/bpf/trace_helpers.c
struct perf_event_sample {
  struct perf_event_header header;
  __u32 size;
  char data[];
};

static enum bpf_perf_event_ret
perf_event_handle(struct perf_event_header *hdr, void *private_data)
{
  // XXX: based on bpf_perf_event_print
  // in kernel tools/testing/selftests/bpf/trace_helpers.c

  struct perf_event_sample *e = (struct perf_event_sample *)hdr;
  bpf_transport_context *ctx = (bpf_transport_context *)private_data;
  bpf_perf_event_ret ret;

  // Make sure we weren't passed a userspace context by accident.
  assert(ctx->pmu_fd >= 0);

  if (e->header.type == PERF_RECORD_SAMPLE)
    {
      __u32 actual_size = e->size - sizeof(e->size);
      ret = bpf_handle_transport_msg(e->data, actual_size, ctx);
      if (ret != LIBBPF_PERF_EVENT_CONT)
        return ret;
    }
  else if (e->header.type == PERF_RECORD_LOST)
    {
      struct lost_events {
        struct perf_event_header header;
        __u64 id;
        __u64 lost;
      };
      struct lost_events *lost = (lost_events *) e;
      fprintf(stderr, "WARNING: lost %lld perf_events on cpu %d\n",
              (long long)lost->lost, ctx->cpu);
    }
  else
    fprintf(stderr, "WARNING: unknown perf_event type=%d size=%d on cpu %d\n",
            e->header.type, e->header.size, ctx->cpu);
  return LIBBPF_PERF_EVENT_CONT;
}
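// Record layout consumed above (PERF_SAMPLE_RAW): a perf_event_header,
// then a __u32 raw-data length, then the payload bytes written by the bpf
// program's perf_event_output helper; lost_events likewise mirrors the
// kernel's PERF_RECORD_LOST layout (header, id, lost count).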
// PR22330: Listen for perf_events.
static void
perf_event_loop(pthread_t main_thread)
{
  // XXX: based on perf_event_poller_multi()
  // in kernel tools/testing/selftests/bpf/trace_helpers.c

  enum bpf_perf_event_ret ret;
  void *data = NULL;
  size_t len = 0;

  unsigned ncpus
    = map_attrs[bpf::globals::perf_event_map_idx].max_entries;
  unsigned n_active_cpus
    = count_active_cpus();
  struct pollfd *pmu_fds
    = (struct pollfd *)malloc(n_active_cpus * sizeof(struct pollfd));
  vector<unsigned> cpuids;

  assert(ncpus == perf_fds.size());

  unsigned j = 0;
  for (unsigned cpu = 0; cpu < ncpus; cpu++)
    {
      if (!cpu_online[cpu]) continue; // -- skip inactive CPUs.

      pmu_fds[j].fd = perf_fds[cpu];
      pmu_fds[j].events = POLLIN;
      cpuids.push_back(cpu);
      j++;
    }
  assert(n_active_cpus == cpuids.size());

  // Avoid multiple warnings about errors reading from an fd:
  std::set<int> already_warned;

  for (;;)
    {
      if (verbose > 1)
        fprintf(stderr, "Polling for perf_event data on %d cpus...\n", n_active_cpus);
      int ready = poll(pmu_fds, n_active_cpus, 1000); // XXX: Consider setting timeout -1 (unlimited).
      if (ready < 0 && errno == EINTR)
        goto signal_exit;
      if (ready < 0)
        fatal("Error checking for perf events: %s\n", strerror(errno));
      for (unsigned i = 0; i < n_active_cpus; i++)
        {
          if (pmu_fds[i].revents <= 0)
            continue;
          if (verbose > 1)
            fprintf(stderr, "Saw perf_event on fd %d\n", pmu_fds[i].fd);

          unsigned cpu = cpuids[i];
          ret = bpf_perf_event_read_simple
            (perf_headers[cpu],
             perf_event_page_count * perf_event_page_size,
             perf_event_page_size,
             &data, &len,
             perf_event_handle, transport_contexts[cpu]);

          if (ret == LIBBPF_PERF_EVENT_DONE)
            {
              // Saw STP_EXIT message. If the exit flag is set,
              // wake up main thread to begin program shutdown.
              if (get_exit_status())
                goto signal_exit;
              continue;
            }
          if (ret != LIBBPF_PERF_EVENT_CONT)
            if (already_warned.count(pmu_fds[i].fd) == 0)
              {
                fprintf(stderr, "WARNING: could not read from perf_event buffer on fd %d\n", pmu_fds[i].fd);
                already_warned.insert(pmu_fds[i].fd);
              }
        }
    }

 signal_exit:
  pthread_kill(main_thread, SIGINT);
  free(pmu_fds);
}
static void
usage(const char *argv0)
{
  printf("Usage: %s [-v][-w][-V][-h] [-x pid] [-o FILE] <bpf-file>\n"
         "  -h, --help       Show this help text\n"
         "  -v, --verbose    Increase verbosity\n"
         "  -V, --version    Show version\n"
         "  -w               Suppress warnings\n"
         "  -x pid           Sets the '_stp_target' variable to pid.\n"
         "  -o FILE          Send output to FILE\n",
         argv0);
}
static void
sigint(int s)
{
  // suppress any subsequent SIGINTs that may come from stap parent process
  signal(s, SIG_IGN);

  // during the exit phase, ^C should exit immediately
  if (exit_phase)
    {
      if (!interrupt_message) // avoid duplicate message
        fprintf(stderr, "received interrupt during exit probe\n");
      interrupt_message = 1;
      exit(1);
    }

  // Otherwise, set the exit flag so probes and the transport shut down:
  int key = bpf::globals::EXIT;
  long val = 1;

  if (bpf_update_elem
      (map_fds[bpf::globals::internal_map_idx], &key, &val, 0) != 0)
    fatal("error during bpf map update: %s\n", strerror(errno));
}
int
main(int argc, char **argv)
{
  static const option long_opts[] = {
    { "help", 0, NULL, 'h' },
    { "verbose", 0, NULL, 'v' },
    { "version", 0, NULL, 'V' },
    { NULL, 0, NULL, 0 },
  };

  int rc;
  while ((rc = getopt_long(argc, argv, "hvVwx:o:", long_opts, NULL)) >= 0)
    switch (rc)
      {
      case 'h':
        usage(argv[0]);
        return 0;
      case 'v':
        verbose++;
        break;
      case 'w':
        warnings = 0;
        break;
      case 'x':
        target_pid = atoi(optarg);
        break;
      case 'o':
        output_f = fopen(optarg, "w");
        if (output_f == NULL)
          {
            fprintf(stderr, "Error opening %s for output: %s\n",
                    optarg, strerror(errno));
            return 1;
          }
        break;
      case 'V':
        printf("Systemtap BPF loader/runner (version %s, %s)\n"
               "Copyright (C) 2016-2019 Red Hat, Inc. and others\n" // PRERELEASE
               "This is free software; "
               "see the source for copying conditions.\n",
               VERSION, STAP_EXTENDED_VERSION);
        return 0;
      default:
        usage(argv[0]);
        return 1;
      }
  if (optind != argc - 1)
    {
      usage(argv[0]);
      return 1;
    }

  // Be sure dmesg mentions that we are loading bpf programs:
  kmsg = fopen("/dev/kmsg", "w");
  if (kmsg == NULL)
    fprintf(stderr, "WARNING: could not open /dev/kmsg for diagnostics: %s\n", strerror(errno));

  load_bpf_file(argv[optind]); // <- XXX initializes cpus online, PR24543 initializes default_cpu
  init_internal_globals();
  init_perf_transport();

  // Create a bpf_transport_context for userspace programs:
  unsigned ncpus = map_attrs[bpf::globals::perf_event_map_idx].max_entries;
  bpf_transport_context uctx(default_cpu, -1/*pmu_fd*/, ncpus,
                             map_attrs, &map_fds, output_f,
                             &interned_strings, &aggregates);

  if (create_group_fds() < 0)
    fatal("Error creating perf event group: %s\n", strerror(errno));
  register_kprobes();
  register_uprobes();
  register_timers();
  register_perf();
  register_tracepoints();

  // Run the begin probes.
  if (prog_begin)
    bpf_interpret(prog_begin->d_size / sizeof(bpf_insn),
                  static_cast<bpf_insn *>(prog_begin->d_buf),
                  &uctx);

  // Wait for ^C; read BPF_OUTPUT events, copying them to output_f.
  signal(SIGINT, (sighandler_t)sigint);
  signal(SIGTERM, (sighandler_t)sigint);

  // PR22330: Listen for perf_events:
  std::thread(perf_event_loop, pthread_self()).detach();

  // Now that the begin probe has run and the perf_event listener is active, enable the kprobes.
  ioctl(group_fd, PERF_EVENT_IOC_ENABLE, 0);

  // Wait for STP_EXIT message:
  while (!get_exit_status())
    pause();

  // Disable the kprobes before deregistering and running exit probes.
  ioctl(group_fd, PERF_EVENT_IOC_DISABLE, 0);

  // Unregister all probes.
  unregister_kprobes(kprobes.size());
  unregister_uprobes(uprobes.size());
  unregister_timers(timers.size());
  unregister_perf(perf_probes.size());
  unregister_tracepoints(tracepoint_probes.size());

  // We are now running exit probes, so ^C should exit immediately:
  exit_phase = 1;
  signal(SIGINT, (sighandler_t)sigint); // restore previously ignored signal
  signal(SIGTERM, (sighandler_t)sigint);

  // Run the end+error probes.
  if (prog_end)
    bpf_interpret(prog_end->d_size / sizeof(bpf_insn),
                  static_cast<bpf_insn *>(prog_end->d_buf),
                  &uctx);

  // Clean up transport layer allocations:
  for (std::vector<bpf_transport_context *>::iterator it = transport_contexts.begin();
       it != transport_contexts.end(); it++)
    delete *it;

  elf_end(module_elf);
  if (kmsg != NULL)
    fclose(kmsg);
  return 0;
}