From d2d39de66492401c349907d59a63b9c95cfba2ad Mon Sep 17 00:00:00 2001
From: Josh Stone
Date: Thu, 17 Apr 2014 14:32:40 -0700
Subject: [PATCH] Prevent lock-recursion in _stp_ctl_send

In rare cases, we may hit a probe while the transport layer is holding
a spinlock, and that probe may call _stp_ctl_send, which tries to grab
the same lock and deadlocks.  This is a bit easier to trigger on
lockdep-enabled kernels with the lock_acquired tracepoint.

This patch refactors the context->busy state management into get/put
context functions, and those areas which grab probe-sensitive locks now
wrap themselves with a context to stay comfortably free of probes.

Tangentially, the dyninst side abandons the busy flag, as it already
had a tls_context pointer to prevent direct recursion and a mutex for
exclusive access across all processes.

DEBUG_REENTRANCY is an unfortunate casualty, because we can't safely
call _stp_warn while the context may be busy on behalf of those held
locks.
---
 runtime/common_probe_context.h    | 11 ++++---
 runtime/dyninst/runtime_context.h |  8 +++--
 runtime/linux/runtime_context.h   | 19 +++++++++--
 runtime/transport/control.c       | 34 +++++++++++++++++---
 runtime/transport/transport.c     |  7 ++++
 tapset-procfs.cxx                 |  2 --
 tapsets.cxx                       | 53 +++++--------------------------
 7 files changed, 74 insertions(+), 60 deletions(-)
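A minimal sketch of the recursion being prevented (illustrative only,
not part of the diff below; the tracepoint is merely one plausible
trigger):

	/* The transport layer takes a probe-visible lock... */
	spin_lock_irqsave(&_stp_ctl_ready_lock, flags);

	/* ...the lock_acquired tracepoint fires here on a lockdep
	   kernel; its handler reports something via _stp_ctl_send(),
	   which tries to take _stp_ctl_ready_lock again: deadlock. */

	spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);

After this patch, such regions mark the per-CPU context busy first, so
a probe landing inside them fails _stp_runtime_entryfn_get_context()
and is skipped instead of recursing into the lock.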
diff --git a/runtime/common_probe_context.h b/runtime/common_probe_context.h
index 1d4917391..da157615e 100644
--- a/runtime/common_probe_context.h
+++ b/runtime/common_probe_context.h
@@ -3,6 +3,7 @@
    Available to C-based probe handlers as fields of the CONTEXT ptr. */
 
 #ifdef __DYNINST__
+
 /* The index of this context structure with the array of allocated
    context structures. */
 int data_index;
@@ -15,16 +16,18 @@
 pthread_mutex_t lock;
 
 /* The transport data for this context structure. */
 struct _stp_transport_context_data transport_data;
-#endif
+
+#else
 
 /* Used to indicate whether a probe context is in use.
-   Tested in the code entering the probe setup by common_probe_entry_prologue
-   and cleared by the common_probe_entry_epilogue code. When an early error
-   forces a goto probe_epilogue then needs an explicitly atomic_dec() first.
+   Tested in the probe prologue by _stp_runtime_entryfn_get_context
+   and cleared in the epilogue via _stp_runtime_entryfn_put_context.
    All context busy flags are tested on module unload to prevent
    unloading while some probe is still running. */
 atomic_t busy;
 
+#endif
+
 /* The fully-resolved probe point associated with a currently running
    probe handler, including alias and wild-card expansion effects.
    aka stap_probe->pp. Setup by common_probe_entryfn_prologue.
diff --git a/runtime/dyninst/runtime_context.h b/runtime/dyninst/runtime_context.h
index 031ffc050..24034c745 100644
--- a/runtime/dyninst/runtime_context.h
+++ b/runtime/dyninst/runtime_context.h
@@ -152,7 +152,7 @@ static void _stp_runtime_entryfn_put_context(struct context *c)
 		tls_context = NULL;
 		pthread_mutex_unlock(&c->lock);
 	}
-	// else, warn about bad state?
+	/* else, warn about bad state? */
 	return;
 }
 
@@ -180,7 +180,10 @@ static void _stp_runtime_context_wait(void)
 	for (i = 0; i < _stp_runtime_num_contexts; i++) {
 		struct context *c = stp_session_context(i);
 
-		if (atomic_read (&c->busy)) {
+		int ret = pthread_mutex_trylock(&c->lock);
+		if (ret == 0)
+			pthread_mutex_unlock(&c->lock);
+		else if (ret == EBUSY) {
 			holdon = 1;
 
 			/* Just In case things are really stuck, let's print
@@ -203,6 +206,7 @@ static void _stp_runtime_context_wait(void)
 				hold_index = i;
 			}
 		}
+		/* else other pthread error == broken? what then? */
 	}
 
 #ifdef STAP_OVERRIDE_STUCK_CONTEXT
diff --git a/runtime/linux/runtime_context.h b/runtime/linux/runtime_context.h
index 18122749c..6b3c47e9e 100644
--- a/runtime/linux/runtime_context.h
+++ b/runtime/linux/runtime_context.h
@@ -45,12 +45,25 @@ static void _stp_runtime_contexts_free(void)
 
 static struct context * _stp_runtime_entryfn_get_context(void)
 {
-	return contexts[smp_processor_id()];
+	struct context* __restrict__ c = NULL;
+	preempt_disable ();
+	c = contexts[smp_processor_id()];
+	if (c != NULL) {
+		if (atomic_inc_return(&c->busy) == 1)
+			return c;
+		atomic_dec(&c->busy);
+	}
+	preempt_enable_no_resched();
+	return NULL;
 }
 
-static inline void _stp_runtime_entryfn_put_context(struct context *c __attribute__((unused)))
+static inline void _stp_runtime_entryfn_put_context(struct context *c)
 {
-	/* Do nothing. */
+	if (c && c == contexts[smp_processor_id()]) {
+		atomic_dec(&c->busy);
+		preempt_enable_no_resched();
+	}
+	/* else, warn about bad state? */
 	return;
 }
 
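The kernel-side get/put pair above is what makes the guard work: get
marks this CPU's context busy with preemption disabled, so a probe
firing meanwhile fails the busy check and skips itself.  The transport
changes that follow all apply one wrapping pattern, shown here in
isolation (a sketch; do_locked_work() is a hypothetical placeholder for
code taking the locks listed in the _stp_ctl_send comment below):

	static void guarded_region(void)
	{
		struct context *c;

		/* Claim this CPU's context (busy set, preemption off).
		   Any probe firing from here on sees busy != 1 in
		   _stp_runtime_entryfn_get_context() and is skipped. */
		c = _stp_runtime_entryfn_get_context();

		do_locked_work();  /* may take _stp_ctl_ready_lock, etc. */

		/* Clear busy and re-enable preemption; c == NULL (when a
		   probe already owned the context) is tolerated. */
		_stp_runtime_entryfn_put_context(c);
	}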
diff --git a/runtime/transport/control.c b/runtime/transport/control.c
index 9c07b47da..4b9975144 100644
--- a/runtime/transport/control.c
+++ b/runtime/transport/control.c
@@ -416,6 +416,7 @@ static void _stp_ctl_free_buffer(struct _stp_buffer *bptr)
    on error. */
 static int _stp_ctl_send(int type, void *data, unsigned len)
 {
+	struct context* __restrict__ c = NULL;
 	struct _stp_buffer *bptr;
 	unsigned long flags;
 	unsigned hlen;
@@ -439,23 +440,37 @@ static int _stp_ctl_send(int type, void *data, unsigned len)
 		return 0;
 	}
 
+	/* Prevent probe reentrancy while grabbing probe-used locks.
+	   Since _stp_ctl_send may be called from arbitrary probe context, we
+	   have to make sure that all locks it wants can't possibly be held
+	   outside probe context too.  This includes:
+	     * _stp_ctl_ready_lock
+	     * _stp_pool_q->lock
+	     * _stp_ctl_special_msg_lock
+	   We ensure this by grabbing the context here and everywhere else that
+	   uses those locks, so such a probe will appear reentrant and be
+	   skipped rather than deadlock. */
+	c = _stp_runtime_entryfn_get_context();
+
 	/* get a buffer from the free pool */
 	bptr = _stp_ctl_get_buffer(type, data, len);
 	if (unlikely(bptr == NULL)) {
 		/* Nothing else we can do... but let's not spam the kernel
 		   with these reports. */
 		/* printk(KERN_ERR "ctl_write_msg type=%d len=%d ENOMEM\n", type, len); */
+		_stp_runtime_entryfn_put_context(c);
 		return -ENOMEM;
 	}
 
-	/* put it on the pool of ready buffers.  Even though we are
-	   holding a lock, calling list_add_tail() is safe here since
-	   it will be totally inlined from the list.h header file so
-	   we cannot recursively hit a kprobe inside the lock. */
+	/* Put it on the pool of ready buffers.  It's possible to recursively
+	   hit a probe here, like a kprobe in NMI or the lock tracepoints, but
+	   they will be squashed since we're holding the context busy. */
 	spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
 	list_add_tail(&bptr->list, &_stp_ctl_ready_q);
 	spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
 
+	_stp_runtime_entryfn_put_context(c);
+
 	/* It would be nice if we could speed up the notification
 	   timer at this point, but calling mod_timer() at this
 	   point would bring in more locking issues... */
@@ -487,18 +502,24 @@ static int _stp_ctl_send_notify(int type, void *data, unsigned len)
 static ssize_t _stp_ctl_read_cmd(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
 {
+	struct context* __restrict__ c = NULL;
 	struct _stp_buffer *bptr;
 	int len;
 	unsigned long flags;
 
+	/* Prevent probe reentrancy while grabbing probe-used locks. */
+	c = _stp_runtime_entryfn_get_context();
+
 	/* wait for nonempty ready queue */
 	spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
 	while (list_empty(&_stp_ctl_ready_q)) {
 		spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
+		_stp_runtime_entryfn_put_context(c);
 		if (file->f_flags & O_NONBLOCK)
 			return -EAGAIN;
 		if (wait_event_interruptible(_stp_ctl_wq, !list_empty(&_stp_ctl_ready_q)))
 			return -ERESTARTSYS;
+		c = _stp_runtime_entryfn_get_context();
 		spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
 	}
 
@@ -507,6 +528,9 @@ static ssize_t _stp_ctl_read_cmd(struct file *file, char __user *buf,
 	list_del_init(&bptr->list);
 	spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
 
+	/* NB: we can't hold the context across copy_to_user, as it might fault. */
+	_stp_runtime_entryfn_put_context(c);
+
 	/* write it out */
 	len = bptr->len + 4;
 	if (len > count || copy_to_user(buf, &bptr->type, len)) {
@@ -521,7 +545,9 @@ static ssize_t _stp_ctl_read_cmd(struct file *file, char __user *buf,
 	}
 
 	/* put it on the pool of free buffers */
+	c = _stp_runtime_entryfn_get_context();
 	_stp_ctl_free_buffer(bptr);
+	_stp_runtime_entryfn_put_context(c);
 
 	return len;
 }
diff --git a/runtime/transport/transport.c b/runtime/transport/transport.c
index 0ddf514bd..d46bf3f04 100644
--- a/runtime/transport/transport.c
+++ b/runtime/transport/transport.c
@@ -296,11 +296,18 @@ static void _stp_ctl_work_callback(unsigned long val)
 {
 	int do_io = 0;
 	unsigned long flags;
+	struct context* __restrict__ c = NULL;
+
+	/* Prevent probe reentrancy while grabbing probe-used locks. */
+	c = _stp_runtime_entryfn_get_context();
 
 	spin_lock_irqsave(&_stp_ctl_ready_lock, flags);
 	if (!list_empty(&_stp_ctl_ready_q))
 		do_io = 1;
 	spin_unlock_irqrestore(&_stp_ctl_ready_lock, flags);
+
+	_stp_runtime_entryfn_put_context(c);
+
 	if (do_io)
 		wake_up_interruptible(&_stp_ctl_wq);
 
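Note why _stp_ctl_read_cmd above keeps dropping and re-taking the
context: _stp_runtime_entryfn_get_context() leaves preemption disabled
and the busy flag set, so the context must not be held across anything
that can sleep or fault, and both wait_event_interruptible() and
copy_to_user() can.  Schematically (not the literal code; wq and cond
stand in for the real arguments):

	c = _stp_runtime_entryfn_get_context();
	/* ...spinlocked queue manipulation only... */
	_stp_runtime_entryfn_put_context(c);	/* drop before sleeping */

	if (wait_event_interruptible(wq, cond))	/* may sleep */
		return -ERESTARTSYS;

	c = _stp_runtime_entryfn_get_context();	/* guard again before relocking */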
diff --git a/tapset-procfs.cxx b/tapset-procfs.cxx
index ec2a86e74..92cdebba7 100644
--- a/tapset-procfs.cxx
+++ b/tapset-procfs.cxx
@@ -286,7 +286,6 @@ procfs_derived_probe_group::emit_module_decls (systemtap_session& s)
   s.op->newline(1) << "atomic_set (session_state(), STAP_SESSION_ERROR);";
   s.op->newline() << "_stp_exit ();";
   s.op->newline(-1) << "}";
-  s.op->newline() << "atomic_dec (& c->busy);";
   s.op->newline() << "goto probe_epilogue;";
   s.op->newline(-1) << "}";
 
@@ -344,7 +343,6 @@ procfs_derived_probe_group::emit_module_decls (systemtap_session& s)
   s.op->newline(1) << "atomic_set (session_state(), STAP_SESSION_ERROR);";
   s.op->newline() << "_stp_exit ();";
   s.op->newline(-1) << "}";
-  s.op->newline() << "atomic_dec (& c->busy);";
   s.op->newline() << "goto probe_epilogue;";
   s.op->newline(-1) << "}";
 
diff --git a/tapsets.cxx b/tapsets.cxx
index 71f3de2ea..b5e04ddfc 100644
--- a/tapsets.cxx
+++ b/tapsets.cxx
@@ -127,9 +127,7 @@ common_probe_entryfn_prologue (systemtap_session& s,
     }
   s.op->newline() << "#endif";
 
-  s.op->newline() << "#if INTERRUPTIBLE";
-  s.op->newline() << "preempt_disable ();";
-  s.op->newline() << "#else";
+  s.op->newline() << "#if !INTERRUPTIBLE";
   s.op->newline() << "local_irq_save (flags);";
   s.op->newline() << "#endif";
 
@@ -154,41 +152,16 @@ common_probe_entryfn_prologue (systemtap_session& s,
   s.op->indent(-1);
 
   s.op->newline() << "c = _stp_runtime_entryfn_get_context();";
-  if (s.runtime_usermode_p())
-    {
-      s.op->newline() << "if (!c) {";
-      s.op->newline(1) << "#if !INTERRUPTIBLE";
-      s.op->newline() << "atomic_inc (skipped_count());";
-      s.op->newline() << "#endif";
-      s.op->newline() << "#ifdef STP_TIMING";
-      s.op->newline() << "atomic_inc (skipped_count_reentrant());";
-      s.op->newline() << "#ifdef DEBUG_REENTRANCY";
-      s.op->newline() << "_stp_warn (\"Skipped %s\\n\", " << probe << "->pp);";
-      s.op->newline() << "#endif";
-      s.op->newline() << "#endif";
-      s.op->newline() << "goto probe_epilogue;";
-      s.op->newline(-1) << "}";
-    }
-
-  s.op->newline() << "if (atomic_inc_return (& c->busy) != 1) {";
+  s.op->newline() << "if (!c) {";
   s.op->newline(1) << "#if !INTERRUPTIBLE";
   s.op->newline() << "atomic_inc (skipped_count());";
   s.op->newline() << "#endif";
   s.op->newline() << "#ifdef STP_TIMING";
   s.op->newline() << "atomic_inc (skipped_count_reentrant());";
-  s.op->newline() << "#ifdef DEBUG_REENTRANCY";
-  s.op->newline() << "_stp_warn (\"Skipped %s due to %s residency on cpu %u\\n\", "
-                  << probe << "->pp, c->probe_point ?: \"?\", smp_processor_id());";
-  // NB: There is a conceivable race condition here with reading
-  // c->probe_point, knowing that this other probe is sort of running.
-  // However, in reality, it's interrupted.  Plus even if it were able
-  // to somehow start again, and stop before we read c->probe_point,
-  // at least we have that ?: "?" bit in there to avoid a NULL deref.
-  s.op->newline() << "#endif";
   s.op->newline() << "#endif";
-  s.op->newline() << "atomic_dec (& c->busy);";
   s.op->newline() << "goto probe_epilogue;";
   s.op->newline(-1) << "}";
+
   s.op->newline();
   s.op->newline() << "c->last_stmt = 0;";
   s.op->newline() << "c->last_error = 0;";
@@ -337,18 +310,9 @@ common_probe_entryfn_epilogue (systemtap_session& s,
   s.op->newline(-1) << "}";
 
-  s.op->newline() << "atomic_dec (&c->busy);";
-
   s.op->newline(-1) << "probe_epilogue:"; // context is free
   s.op->indent(1);
 
-  // In dyninst mode, we're not done with the context yet, since
-  // _stp_error() still needs a context structure (since the log
-  // buffers are stored there).
-  if (!s.runtime_usermode_p())
-    {
-      s.op->newline() << "_stp_runtime_entryfn_put_context(c);";
-    }
 
   if (! s.suppress_handler_errors) // PR 13306
     {
       // Check for excessive skip counts.
@@ -358,15 +322,16 @@ common_probe_entryfn_epilogue (systemtap_session& s,
       s.op->newline(-1) << "}";
     }
 
-  s.op->newline() << "#if INTERRUPTIBLE";
-  s.op->newline() << "preempt_enable_no_resched ();";
-  s.op->newline() << "#else";
+  // We mustn't release the context until after all _stp_error(), so dyninst
+  // mode can still access the log buffers stored therein.
+  s.op->newline() << "_stp_runtime_entryfn_put_context(c);";
+
+  s.op->newline() << "#if !INTERRUPTIBLE";
   s.op->newline() << "local_irq_restore (flags);";
   s.op->newline() << "#endif";
 
   if (s.runtime_usermode_p())
     {
-      s.op->newline() << "_stp_runtime_entryfn_put_context(c);";
       s.op->newline() << "errno = _stp_saved_errno;";
     }
 
@@ -8244,7 +8209,6 @@ uprobe_derived_probe_group::emit_module_utrace_decls (systemtap_session& s)
 	 << "sup->spec_index >= " << probes.size() << ") {";
   s.op->newline(1) << "_stp_error (\"bad spec_index %d (max " << probes.size()
 		   << "): %s\", sup->spec_index, c->probe_point);";
-  s.op->newline() << "atomic_dec (&c->busy);";
   s.op->newline() << "goto probe_epilogue;";
   s.op->newline(-1) << "}";
   s.op->newline() << "c->uregs = regs;";
@@ -8274,7 +8238,6 @@ uprobe_derived_probe_group::emit_module_utrace_decls (systemtap_session& s)
 	 << "sup->spec_index >= " << probes.size() << ") {";
   s.op->newline(1) << "_stp_error (\"bad spec_index %d (max " << probes.size()
 		   << "): %s\", sup->spec_index, c->probe_point);";
-  s.op->newline() << "atomic_dec (&c->busy);";
   s.op->newline() << "goto probe_epilogue;";
   s.op->newline(-1) << "}";
 
-- 
2.43.5
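Taken together, the tapsets.cxx changes give the generated probe
entry/exit code roughly the following shape (reconstructed from the
s.op->newline() emissions above; the real output carries additional
timing, error, and session-state handling):

	#if !INTERRUPTIBLE
	local_irq_save (flags);
	#endif

	c = _stp_runtime_entryfn_get_context();
	if (!c) {
		/* another probe owns this CPU's context; count the skip */
	#if !INTERRUPTIBLE
		atomic_inc (skipped_count());
	#endif
	#ifdef STP_TIMING
		atomic_inc (skipped_count_reentrant());
	#endif
		goto probe_epilogue;
	}

	/* ...probe handler body... */

	probe_epilogue:
	/* put the context only after any _stp_error(), so dyninst mode
	   can still reach the log buffers stored in it */
	_stp_runtime_entryfn_put_context(c);
	#if !INTERRUPTIBLE
	local_irq_restore (flags);
	#endif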