This is the mail archive of the
systemtap@sourceware.org
mailing list for the systemtap project.
[PATCH] Optimize varuse_collecting_visitor::visit_embeddedcode by bool flags.
- From: "Yichun Zhang (agentzh)" <yichun at openresty dot com>
- To: systemtap at sourceware dot org
- Cc: "Yichun Zhang (agentzh)" <yichun at openresty dot com>
- Date: Sat, 16 Mar 2019 16:54:59 -0700
- Subject: [PATCH] Optimize varuse_collecting_visitor::visit_embeddedcode by bool flags.
The varuse_collecting_visitor::visit_embeddedcode() method currently
spends a lot of CPU time on the string comparison operations for the
memo_tagged_p() function. It is very common for the same embedded code
to be used for many times in the same stap script.
Now we cache the computation results via boolean flags in the
embedded_expr and embeddedcode objects. And the memo_tagged_p() function
overhead has reduced dramatically.
The stap analyzing time (phase 2) for one of our largest stap scripts
(lgcpath) now reduces by 93% (from 23s to 1.6s). Yay!
Just for the record, we sampled the *inversed* versions of the on-CPU
flame graphs against the stap process both before and after this
optimization:
Before:
http://openresty.org/misc/flamegraph/old-inv-stap-varuse-embeddedcode.svg
After:
http://openresty.org/misc/flamegraph/new-inv-stap-varuse-embeddedcode.svg
The flame graphs were sampled by stap as well. That's fun :)
---
staptree.cxx | 49 ++++++++++++++++++++++++++++++++++++++++++++---
staptree.h | 7 +++++++
tapset-python.cxx | 12 ++++++++++++
3 files changed, 65 insertions(+), 3 deletions(-)
diff --git a/staptree.cxx b/staptree.cxx
index b53fa7eac..0de30616f 100644
--- a/staptree.cxx
+++ b/staptree.cxx
@@ -365,6 +365,11 @@ bool memo_tagged_p (const interned_string& haystack, const string& needle)
+embedded_expr::embedded_expr(): varuse_collected(false), is_pure(false)
+{
+}
+
+
bool
embedded_expr::tagged_p (const char *tag) const
{
@@ -386,6 +391,12 @@ embedded_expr::tagged_p (const interned_string& tag) const
}
+embeddedcode::embeddedcode(): varuse_collected(false), is_pure(false),
+ is_unmangled(false)
+{
+}
+
+
bool
embeddedcode::tagged_p (const char *tag) const
{
@@ -2576,6 +2587,19 @@ varuse_collecting_visitor::visit_embeddedcode (embeddedcode *s)
{
assert (current_function); // only they get embedded code
+ if (s->varuse_collected)
+ {
+ if (s->is_unmangled)
+ current_function->mangle_oldstyle = true;
+
+ if (! s->is_pure)
+ embedded_seen = true;
+
+ return;
+ }
+
+ s->varuse_collected = true;
+
/* We need to lock globals that are accessed through embedded C code */
for (unsigned i = 0; i < session.globals.size(); i++)
{
@@ -2603,7 +2627,10 @@ varuse_collecting_visitor::visit_embeddedcode (embeddedcode *s)
// PR14524: Support old-style THIS->local syntax on per-function basis.
if (s->tagged_p ("/* unmangled */"))
- current_function->mangle_oldstyle = true;
+ {
+ s->is_unmangled = true;
+ current_function->mangle_oldstyle = true;
+ }
// We want to elide embedded-C functions when possible. For
// example, each $target variable access is expanded to an
@@ -2616,7 +2643,10 @@ varuse_collecting_visitor::visit_embeddedcode (embeddedcode *s)
// Also, explicit side-effect-free tapset functions will have this.
if (s->tagged_p ("/* pure */"))
- return;
+ {
+ s->is_pure = true;
+ return;
+ }
embedded_seen = true;
}
@@ -2626,6 +2656,16 @@ varuse_collecting_visitor::visit_embeddedcode (embeddedcode *s)
void
varuse_collecting_visitor::visit_embedded_expr (embedded_expr *e)
{
+ if (e->varuse_collected)
+ {
+ if (! e->is_pure)
+ embedded_seen = true;
+
+ return;
+ }
+
+ e->varuse_collected = true;
+
/* We need to lock globals that are accessed through embedded C code */
for (unsigned i = 0; i < session.globals.size(); i++)
{
@@ -2675,7 +2715,10 @@ varuse_collecting_visitor::visit_embedded_expr (embedded_expr *e)
// Also, explicit side-effect-free tapset functions will have this.
if (e->tagged_p ("/* pure */"))
- return;
+ {
+ e->is_pure = true;
+ return;
+ }
embedded_seen = true;
}
diff --git a/staptree.h b/staptree.h
index fce3de512..81356d4e9 100644
--- a/staptree.h
+++ b/staptree.h
@@ -184,6 +184,9 @@ struct literal_number: public literal
struct embedded_expr: public expression
{
interned_string code;
+ bool varuse_collected;
+ bool is_pure;
+ embedded_expr ();
bool tagged_p (const char *tag) const;
bool tagged_p (const std::string &tag) const;
bool tagged_p (const interned_string& tag) const;
@@ -706,6 +709,10 @@ std::ostream& operator << (std::ostream& o, const statement& k);
struct embeddedcode: public statement
{
interned_string code;
+ bool varuse_collected;
+ bool is_pure;
+ bool is_unmangled;
+ embeddedcode ();
bool tagged_p (const char *tag) const;
bool tagged_p (const std::string& tag) const;
bool tagged_p (const interned_string& tag) const;
diff --git a/tapset-python.cxx b/tapset-python.cxx
index 56a0f286f..2f67e7627 100644
--- a/tapset-python.cxx
+++ b/tapset-python.cxx
@@ -189,6 +189,12 @@ python_derived_probe_group::enroll (python_derived_probe* p)
python2_embedded = new embeddedcode;
s.embeds.push_back(python2_embedded);
}
+ else
+ {
+ python2_embedded->varuse_collected = false;
+ python2_embedded->is_pure = false;
+ python2_embedded->is_unmangled = false;
+ }
unsigned index = 0;
stringstream data;
@@ -213,6 +219,12 @@ python_derived_probe_group::enroll (python_derived_probe* p)
python3_embedded = new embeddedcode;
s.embeds.push_back(python3_embedded);
}
+ else
+ {
+ python3_embedded->varuse_collected = false;
+ python3_embedded->is_pure = false;
+ python3_embedded->is_unmangled = false;
+ }
unsigned index = 0;
stringstream data;
--
2.17.2