From 5826dc359ef5a168b29cb3f740308e242dba2ebd Mon Sep 17 00:00:00 2001 From: "Frank Ch. Eigler" Date: Mon, 25 Jun 2018 12:34:43 -0400 Subject: [PATCH] PR23160,PR14690: 32-on-64 bit fixes After reported crashes with the syscalls.* test cases, found that 32-on-64 bits were b0rked, because pt_regs* addresses were being truncated and yet later dereferenced in kernel space. To simplify analysis, added a pt_regs *sregs to the common probe context, which signifies 'syscall mode' register dumps. This is different from normal kregs (kernel-space, normal abi, 64-bit-only) and uregs (user-space, normal abi, either 32- or 64-bit), and needs custom processing in _stp_syscall_nr and especially _stp_arg2. The sregs-setter embedded-C function __set_syscall_pt_regs(r) needs to be private/tapset-guru, but we lack proper /* markup */ for that particular mode. So that function currently needs to be replicated as private inside each sysc_* file, ugh. Not for long though. After this patch, while this doesn't quite pass, but the read parts look good: sudo make installcheck RUNTESTFLAGS=nd_syscall.exp\ syscall.exp CHECK_ONLY="readwrite" --- runtime/common_probe_context.h | 5 +-- tapset/linux/aux2_syscalls.stp | 15 +++++---- tapset/linux/sysc_read.stp | 28 +++++++++------- tapset/x86_64/registers.stp | 58 ++++++++++++++++++++++++++++++++-- tapsets.cxx | 3 +- 5 files changed, 85 insertions(+), 24 deletions(-) diff --git a/runtime/common_probe_context.h b/runtime/common_probe_context.h index 25acfa5a3..5ab917781 100644 --- a/runtime/common_probe_context.h +++ b/runtime/common_probe_context.h @@ -113,8 +113,9 @@ const char *last_stmt; /* Set when probe handler gets pt_regs handed to it. kregs holds the kernel registers when availble. uregs holds the user registers when available. uregs are at least available when user_mode_p == 1. */ -struct pt_regs *kregs; -struct pt_regs *uregs; +struct pt_regs *kregs; /* !user_mode_p */ +struct pt_regs *uregs; /* user_mode_p */ +struct pt_regs *sregs; /* syscall mode only, untrustworthy since user-controlled */ /* unwaddr is caching unwound address in each probe handler on ia64. */ #if defined __ia64__ diff --git a/tapset/linux/aux2_syscalls.stp b/tapset/linux/aux2_syscalls.stp index e6c118a32..f5ca13ffa 100644 --- a/tapset/linux/aux2_syscalls.stp +++ b/tapset/linux/aux2_syscalls.stp @@ -2,18 +2,20 @@ pulling in unnecessary larger pass-2 computations. */ -/* When a syscall interjection mechanism gives us a pt_regs* - structure for the syscall parameters/context, we can pretend - as though it were a user-space probe. */ -function __set_usermode_pt_regs(r) +/* Some syscall interjection mechanisms give us a pt_regs* structure + for the syscall parameters/context. */ +private function __set_syscall_pt_regs(r) %{ - c->uregs = (void*)STAP_ARG_r; - c->user_mode_p = 1; + CONTEXT->sregs = (void*)STAP_ARG_r; %} function _stp_syscall_nr:long () %{ /* pure */ + if (CONTEXT->sregs) { + /* NB: same abi? */ + STAP_RETVALUE = _stp_syscall_get_nr(current, CONTEXT->sregs); + } else { struct pt_regs *regs = _stp_current_pt_regs(); if (!regs) { CONTEXT->last_error = ("Cannot access syscall number" @@ -21,5 +23,6 @@ function _stp_syscall_nr:long () return; } STAP_RETVALUE = _stp_syscall_get_nr(current, regs); + } %} diff --git a/tapset/linux/sysc_read.stp b/tapset/linux/sysc_read.stp index 7ba0cc0a6..c5098a796 100644 --- a/tapset/linux/sysc_read.stp +++ b/tapset/linux/sysc_read.stp @@ -14,7 +14,6 @@ @define _SYSCALL_READ_REGARGS %( - asmlinkage() fd = int_arg(1) buf_uaddr = pointer_arg(2) count = ulong_arg(3) @@ -79,6 +78,7 @@ probe nd1_syscall.read = %) { @_SYSCALL_READ_NAME + asmlinkage() @_SYSCALL_READ_REGARGS @_SYSCALL_READ_ARGSTR } @@ -91,20 +91,25 @@ probe __nd1_syscall.read = kprobe.function("sys_read") } /* kernel 4.17+ */ +private function __set_syscall_pt_regs(r) +%{ + CONTEXT->sregs = (void*)STAP_ARG_r; +%} + + probe nd2_syscall.read = kprobe.function("__*_sys_read") /* _x64_ etc. */ { - asmlinkage() - __set_usermode_pt_regs(pointer_arg(1)) + __set_syscall_pt_regs(pointer_arg(1)) @_SYSCALL_READ_NAME @_SYSCALL_READ_REGARGS @_SYSCALL_READ_ARGSTR } - /* kernel 3.5+, but undesirable because of it affects all syscalls */ +/* kernel 3.5+, but undesirable because of it affects all syscalls */ probe tp_syscall.read = kernel.trace("sys_enter") { - if ($id != @const("__NR_read")) next; - __set_usermode_pt_regs($regs) + __set_syscall_pt_regs($regs) + @__syscall_gate(@const("__NR_read")) @_SYSCALL_READ_NAME @_SYSCALL_READ_REGARGS @_SYSCALL_READ_ARGSTR @@ -139,17 +144,16 @@ probe __nd1_syscall.read.return = kprobe.function("sys_read").return /* kernel 4.17+ */ probe nd2_syscall.read.return = kprobe.function("__*_sys_read").return /* _x64_ etc. */ { - asmlinkage() - __set_usermode_pt_regs(pointer_arg(1)) @_SYSCALL_READ_NAME - retstr = returnstr(1) + retstr = returnstr(1) /* NB: not in the $regs */ } + /* kernel 3.5+, but undesirable because of it affects all syscalls */ probe tp_syscall.read.return = kernel.trace("sys_exit") { - /* if ($id != @const("__NR_read")) next; */ /* XXX ... whoops, no $id passed for sys_exit */ - __set_usermode_pt_regs($regs) + __set_syscall_pt_regs($regs) + @__syscall_gate(@const("__NR_read")) @_SYSCALL_READ_NAME - retstr = returnstr(1) + retstr = return_str(1,$ret) } diff --git a/tapset/x86_64/registers.stp b/tapset/x86_64/registers.stp index 864983bd9..61701d038 100644 --- a/tapset/x86_64/registers.stp +++ b/tapset/x86_64/registers.stp @@ -120,7 +120,55 @@ function _stp_arg2:long (argnum:long, sign_extend:long, truncate:long, struct pt_regs *regs; int result, n, nr_regargs; size_t argsz = sizeof(long); - regs = (CONTEXT->user_mode_p ? CONTEXT->uregs : CONTEXT->kregs); + + if (CONTEXT->sregs) { /* syscall-in-pt_regs mode: kernel 4.17+ or similar */ + void* valptr; + int is_32bit = _stp_is_compat_task(); + regs = CONTEXT->sregs; + + /* NB: kread() is probably unnecessary paranoia w.r.t. the sregs pointer. */ + if (is_32bit) switch (STAP_ARG_argnum) { + /* SC_IA32_REGS_TO_ARGS / SC_X86_64_REGS_TO_ARGS */ + case 1: valptr = &(regs->bx); break; + case 2: valptr = &(regs->cx); break; + case 3: valptr = &(regs->dx); break; + case 4: valptr = &(regs->si); break; + case 5: valptr = &(regs->di); break; + case 6: valptr = &(regs->bp); break; + default: + goto bad_argnum; + } + else switch (STAP_ARG_argnum) { + /* SC_IA32_REGS_TO_ARGS / SC_X86_64_REGS_TO_ARGS */ + case 1: valptr = &(regs->di); break; + case 2: valptr = &(regs->si); break; + case 3: valptr = &(regs->dx); break; + case 4: valptr = &(regs->r10); break; + case 5: valptr = &(regs->r8); break; + case 6: valptr = &(regs->r9); break; + default: + goto bad_argnum; + } + + val = kread((long *)valptr); + + /* Here, we do sign extension / truncation, ignoring CONTEXT->user_mode_p. */ + if ((STAP_ARG_truncate || _stp_is_compat_task()) + && !STAP_ARG_force64) { + if (STAP_ARG_sign_extend) + STAP_RETVALUE = (int64_t) __stp_sign_extend32(val); + else + /* High bits may be garbage. */ + STAP_RETVALUE = (int64_t) (val & 0xffffffff); + } else + STAP_RETVALUE = (int64_t) val; + + return; + } + + // fall through in the non-sycall-mode case + + regs = (CONTEXT->user_mode_p ? CONTEXT->uregs : CONTEXT->kregs); STAP_RETVALUE = 0; if (!regs) { @@ -140,11 +188,13 @@ function _stp_arg2:long (argnum:long, sign_extend:long, truncate:long, nr_regargs = 6; } else nr_regargs = (CONTEXT->regparm & _STP_REGPARM_MASK); + if (!STAP_ARG_force64 && CONTEXT->user_mode_p && _stp_is_compat_task()) { argsz = sizeof(int); result = _stp_get_arg32_by_number(n, nr_regargs, regs, &val); - } else + } else { result = _stp_get_arg64_by_number(n, nr_regargs, regs, &val); + } switch (result) { case 0: /* Arg is in register. */ @@ -169,7 +219,9 @@ function _stp_arg2:long (argnum:long, sign_extend:long, truncate:long, default: goto bad_argnum; } - if ((STAP_ARG_truncate || _stp_is_compat_task()) + + /* Sign-extend etc. as needed, depending on process 32-bitness. */ + if ((STAP_ARG_truncate || (CONTEXT->user_mode_p && _stp_is_compat_task())) && !STAP_ARG_force64) { if (STAP_ARG_sign_extend) STAP_RETVALUE = (int64_t) __stp_sign_extend32(val); diff --git a/tapsets.cxx b/tapsets.cxx index ef19cac1b..5ba6f50b9 100644 --- a/tapsets.cxx +++ b/tapsets.cxx @@ -186,6 +186,7 @@ common_probe_entryfn_prologue (systemtap_session& s, s.op->newline() << "c->nesting = -1;"; // NB: PR10516 packs locals[] tighter s.op->newline() << "c->uregs = 0;"; s.op->newline() << "c->kregs = 0;"; + s.op->newline() << "c->sregs = 0;"; s.op->newline() << "#if defined __ia64__"; s.op->newline() << "c->unwaddr = 0;"; s.op->newline() << "#endif"; @@ -198,7 +199,7 @@ common_probe_entryfn_prologue (systemtap_session& s, s.op->newline() << "c->probe_type = " << probe_type << ";"; // reset Individual Probe State union s.op->newline() << "memset(&c->ips, 0, sizeof(c->ips));"; - s.op->newline() << "c->user_mode_p = 0; c->full_uregs_p = 0;"; + s.op->newline() << "c->user_mode_p = 0; c->full_uregs_p = 0; "; s.op->newline() << "#ifdef STAP_NEED_REGPARM"; // i386 or x86_64 register.stp s.op->newline() << "c->regparm = 0;"; s.op->newline() << "#endif"; -- 2.43.5