#define VALN s
#define VALSTOR char value[MAP_STRING_LENGTH]
#define MAP_GET_VAL(node) ((node)->value)
-#define MAP_SET_VAL(map,node,val,add) _new_map_set_str(map,MAP_GET_VAL(node),val,add)
-#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add)
+#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_str(map,MAP_GET_VAL(node),val,add)
+#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add,0,0,0,0,0)
#define NULLRET ""
#elif VALUE_TYPE == INT64
#define VALTYPE int64_t
#define VALN i
#define VALSTOR int64_t value
#define MAP_GET_VAL(node) ((node)->value)
-#define MAP_SET_VAL(map,node,val,add) _new_map_set_int64(map,&MAP_GET_VAL(node),val,add)
-#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add)
+#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_int64(map,&MAP_GET_VAL(node),val,add)
+#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add,0,0,0,0,0)
#define NULLRET (int64_t)0
#elif VALUE_TYPE == STAT
#define VALTYPE stat_data*
#define VALN x
#define VALSTOR stat_data value
#define MAP_GET_VAL(node) (&(node)->value)
-#define MAP_SET_VAL(map,node,val,add) _new_map_set_stat(map,MAP_GET_VAL(node),val,add)
+#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_stat(map,MAP_GET_VAL(node),val,add,s1,s2,s3,s4,s5)
#define MAP_COPY_VAL(map,node,val,add) _new_map_copy_stat(map,MAP_GET_VAL(node),val,add)
#define NULLRET (stat_data*)0
#else
#endif /* VALUE_TYPE */
-static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add)
+static inline int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add, int s1, int s2, int s3, int s4, int s5)
{
unsigned int hv;
struct mhlist_head *head;
mhlist_for_each_entry(n, e, head, node.hnode) {
if (KEY_EQ_P(n)) {
- return MAP_SET_VAL(map, n, val, add);
+ return MAP_SET_VAL(map, n, val, add, s1, s2, s3, s4, s5);
}
}
/* key not found */
if (n == NULL)
return -1;
KEYCPY(n);
- return MAP_SET_VAL(map, n, val, 0);
+ return MAP_SET_VAL(map, n, val, 0, s1, s2, s3, s4, s5);
}
static int KEYSYM(_stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val)
{
- return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 0);
+ return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 0, 1, 1, 1, 1, 1);
}
static int KEYSYM(_stp_map_add) (MAP map, ALLKEYSD(key), VSTYPE val)
{
- return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 1);
+ return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 1, 1, 1, 1, 1, 1);
}
return 0;
}
-static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int add)
+static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int add, int s1, int s2, int s3, int s4, int s5)
{
if (!add) {
Hist st = &map->hist;
}
(&map->hist)->bit_shift = map->bit_shift;
(&map->hist)->stat_ops = map->stat_ops;
- __stp_stat_add (&map->hist, sd, val);
+ __stp_stat_add (&map->hist, sd, val, s1, s2, s3, s4, s5);
return 0;
}
static MAP _stp_pmap_agg (PMAP pmap, map_update_fn update, map_cmp_fn cmp);
static struct map_node *_stp_new_agg(MAP agg, struct mhlist_head *ahead,
struct map_node *ptr, map_update_fn update);
-static int _new_map_set_stat (MAP map, struct stat_data *dst, int64_t val, int add);
+static int _new_map_set_stat (MAP map, struct stat_data *dst, int64_t val, int add, int s1, int s2, int s3, int s4, int s5);
static int _new_map_copy_stat (MAP map, struct stat_data *dst, struct stat_data *src, int add);
static void _stp_map_sort (MAP map, int keynum, int dir, map_get_key_fn get_key);
static void _stp_map_sortn(MAP map, int n, int keynum, int dir, map_get_key_fn get_key);
{
int res;
MAP m = _stp_pmap_get_map (pmap, MAP_GET_CPU());
- res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 0);
+ res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 0, 1, 1, 1, 1, 1);
MAP_PUT_CPU();
return res;
}
-static int KEYSYM(_stp_pmap_add) (PMAP pmap, ALLKEYSD(key), VSTYPE val)
+static inline int KEYSYM(_stp_pmap_add) (PMAP pmap, ALLKEYSD(key), VSTYPE val, int s1, int s2, int s3, int s4, int s5)
{
int res;
MAP m = _stp_pmap_get_map (pmap, MAP_GET_CPU());
m->bit_shift = pmap->bit_shift;
m->stat_ops = pmap->stat_ops;
- res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 1);
+ res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 1, s1, s2, s3, s4, s5);
MAP_PUT_CPU();
return res;
}
_stp_print_flush();
}
-static void __stp_stat_add(Hist st, stat_data *sd, int64_t val)
+static inline void __stp_stat_add(Hist st, stat_data *sd, int64_t val,
+ int stat_op_count, int stat_op_sum, int stat_op_min,
+ int stat_op_max, int stat_op_variance)
{
int n;
int delta = 0;
- /*
- * Below, we use Welford's online algorithm for computing variance.
- * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- */
sd->shift = st->bit_shift;
sd->stat_ops = st->stat_ops;
if (sd->count == 0) {
sd->avg_s = val << sd->shift;
sd->_M2 = 0;
} else {
-		sd->count++;
-		sd->sum += val;
-		if (val > sd->max)
+			if(stat_op_count)
+				sd->count++;
+			if(stat_op_sum)
+				sd->sum += val;
+			/* Guard each extreme with its own operator flag:
+			 * stat_op_max controls sd->max, stat_op_min controls sd->min. */
+			if (stat_op_max && (val > sd->max))
 			sd->max = val;
-		if (val < sd->min)
+			if (stat_op_min && (val < sd->min))
 			sd->min = val;
/*
- * Following is an optimization that improves performance
- * in case @variance() isn't used with given global.
- *
- * Note that this doesn't affect computing of @avg(), which
- * happens within the per-CPU aggregation functions.
+ * Below, we use Welford's online algorithm for computing variance.
+ * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
*/
- if (sd->stat_ops & STAT_OP_VARIANCE) {
+ if (stat_op_variance) {
delta = (val << sd->shift) - sd->avg_s;
sd->avg_s += _stp_div64(NULL, delta, sd->count);
sd->_M2 += delta * ((val << sd->shift) - sd->avg_s);
}
/** Add to a Stat.
- * Add an int64 to a Stat.
+ * Add an int64 to a Stat, and for optimization purposes specify which
+ * statistical operators are bound to given Stat. Set all of stat_op*
+ * to 1 if unsure. Note that @avg() is being evaluated separately based
+ * on @sum and @count within the code directly generated by the translator.
*
* @param st Stat
* @param val Value to add
+ * @param stat_op_count int
+ * @param stat_op_sum int
+ * @param stat_op_min int
+ * @param stat_op_max int
+ * @param stat_op_variance int
+ *
*/
-static void _stp_stat_add (Stat st, int64_t val)
+static inline void _stp_stat_add (Stat st, int64_t val, int stat_op_count,
+ int stat_op_sum, int stat_op_min,
+ int stat_op_max, int stat_op_variance)
{
stat_data *sd = _stp_stat_per_cpu_ptr (st, STAT_GET_CPU());
STAT_LOCK(sd);
- __stp_stat_add (&st->hist, sd, val);
+ __stp_stat_add (&st->hist, sd, val, stat_op_count, stat_op_sum,
+ stat_op_min, stat_op_max, stat_op_variance);
STAT_UNLOCK(sd);
STAT_PUT_CPU();
}
-
static void _stp_stat_clear_data (Stat st, stat_data *sd)
{
int j;
}
s.op->newline() << "#ifdef STP_TIMING";
- s.op->newline() << "if (likely (stat)) _stp_stat_add(stat, cycles_elapsed);";
+ // STP_TIMING requires min, max, avg (and thus count and sum), but not variance.
+ s.op->newline() << "if (likely (stat)) _stp_stat_add(stat, cycles_elapsed, 1, 1, 1, 1, 0);";
s.op->newline() << "#endif";
if (overload_processing && !s.runtime_usermode_p())
--- /dev/null
+# This is a test for stat run time optimizations.
+# See corresponding .stp file for details.
+
+set test "optim_stats"
+
+if {![installtest_p]} {
+ untested $test
+ return
+}
+
+for {set i 1} {$i <= 2} {incr i} {
+ foreach runtime [get_runtime_list] {
+ if {$runtime != ""} {
+ spawn stap --runtime=$runtime -g --suppress-time-limits $srcdir/$subdir/$test$i.stp
+ } else {
+ spawn stap -g --suppress-time-limits $srcdir/$subdir/$test$i.stp
+ }
+
+ expect {
+ -timeout 300
+ -re {^IGNORE[^\r\n]+\r\n} { exp_continue }
+ -re {^PASS test1[^\r\n]+\r\n} { pass "$test$i.stp subtest1 $runtime"; exp_continue }
+ -re {^PASS test2[^\r\n]+\r\n} { pass "$test$i.stp subtest2 $runtime"; exp_continue }
+ -re {^FAIL test1[^\r\n]+\r\n} { fail "$test$i.stp subtest1 $runtime"; exp_continue }
+ -re {^FAIL test2[^\r\n]+\r\n} { fail "$test$i.stp subtest2 $runtime"; exp_continue }
+ timeout {fail "$test: unexpected timeout"}
+ eof { }
+ }
+ catch {close}; catch {wait}
+ }
+}
--- /dev/null
+/*
+ * This is a test for stat run time optimizations. Each stat has a list of
+ * requested statistical operators. For instance, if a script uses stat x,
+ * and only refers to @avg(x), then the list of requested statistical operators
+ * for given stat x is @count, @sum, and @avg. The @min(x) and @max(x) are
+ * not in the list, and thus do not need to be evaluated at the _stp_stat_add()
+ * time (iow, at the x<<<val time). Optimization based on this makes the
+ * systemtap runtime run faster. The goal of this test is to verify that this
+ * sort of optimizations actually works in a measurable way.
+ *
+ * At the moment, the available stat operators are @count, @sum, @min, @max,
+ * @avg, and @variance. The most computationally expensive is @variance.
+ * Detecting the variance optimization is quite simple. Other operators are
+ * computationally cheap and thus detecting their respective optimizations is
+ * somewhat tricky on a multiuser/multitasking system, where so many irrelevant
+ * bearings are affecting our fragile measurement. In this case we must set
+ * the threshold distinguishing between the PASS and FAIL pretty carefully. Just
+ * slightly above the "noise". This testcase is sentenced to be fragile by its
+ * nature though.
+ *
+ * One of the basic assumptions for this sort of test is that if we compare stats
+ * having identical list of requested statistical operators, we should get very
+ * similar results. It turns out, that to achieve this, we can't simply feed the
+ * values into measured stats in straightforward order. Instead, we need to baffle
+ * the optimizations under the hood by complicating the "feed" order slightly.
+ * After verifying this assumption, we can start comparing different stats.
+ *
+ * Since verifying the @variance optimization is much easier and doesn't require
+ * so many time consuming iterations to get reasonable results, this test is
+ * divided into two parts, TEST 1, and TEST 2, where in TEST 1 we focus on the
+ * optimization for @count, @sum, @min, and @max, and then, in TEST 2, we test the
+ * @variance optimization separately. This makes the test itself run faster.
+ *
+ */
+
+@define RANDCNT %( 200000 %)
+@define RANDMAX %( 1000 %)
+@define ITERS %( 1500 %)
+
+@define feed(agg, tagg)
+%(
+ t = time()
+ foreach(k in randvals)
+ @agg <<< k
+ @tagg += time() - t
+%)
+
+global x, tx = 0, y, ty = 0
+global a, ta = 0, b, tb = 0
+global randvals[@RANDCNT]
+
+function time() { return gettimeofday_us() }
+
+probe begin
+{
+ /* TEST 1: test optimizations for @count, @sum, @min, and @max. */
+
+ for (i=0; i<@ITERS; i++)
+ {
+
+ for (j=0; j<@RANDCNT; j++)
+ randvals[j] = randint(@RANDMAX)
+
+ /* The "ordering dance" described above happens here */
+ if(i%2)
+ {
+ @feed(x, tx)
+ @feed(y, ty)
+ }
+ else
+ {
+ @feed(y, ty)
+ @feed(x, tx)
+ }
+ }
+
+ /*
+ * We need to print the stats out to avoid compiler elision.
+ * The list of stats mentioned below makes the actual difference
+ * between stats under test and is the gist of this test. The test
+ * should show no measurable shrinkage, if the below list doesn't
+ * differ for measured stats.
+ */
+ printdln(" ", "IGNORE", @count(x))
+ printdln(" ", "IGNORE", @count(y), @sum(y), @min(y), @max(y))
+
+ /* Measured shrinkage [%] */
+ shrinkage = (ty-tx)*100/ty
+
+ /*
+	 * Threshold [%] (just slightly above the "noise") The usual values were
+ * around 8% at the time of writing this test using gcc-6.2.1-1.fc26.x86_64.
+ * But deeper testing shows, that on other arches, namely on power and arm,
+ * gcc is not so good optimizing the runtime code, so here we only check
+ * for regressions.
+ */
+ treshold = 0
+
+ printf("%s test1 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+
+ /* TEST 2: test optimizations for @variance. */
+
+ for (i=0; i<(@ITERS / 4); i++)
+ {
+
+ for (j=0; j<@RANDCNT; j++)
+ randvals[j] = randint(@RANDMAX)
+
+ if(i%2)
+ {
+ @feed(a, ta)
+ @feed(b, tb)
+ }
+ else
+ {
+ @feed(b, tb)
+ @feed(a, ta)
+ }
+ }
+
+ printdln(" ", "IGNORE", @count(a))
+ printdln(" ", "IGNORE", @variance(b))
+
+ shrinkage = (tb-ta)*100/tb
+
+ /*
+	 * Threshold [%], for this test the usual value is around 68% at the time
+ * of writing this test.
+ */
+ treshold = 20
+
+ printf("%s test2 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+ exit()
+}
--- /dev/null
+/*
+ * Analogy to optim_stats1.stp, but for pmaps. See optim_stats1.stp for comments.
+ */
+
+@define RANDCNT %( 200000 %)
+@define RANDMAX %( 1000 %)
+@define ITERS %( 1500 %)
+
+@define feed(agg, tagg)
+%(
+ t = time()
+ foreach(k in randvals)
+ @agg <<< k
+ @tagg += time() - t
+%)
+
+global x, tx = 0, y, ty = 0
+global a, ta = 0, b, tb = 0
+global randvals[@RANDCNT]
+
+function time() { return gettimeofday_us() }
+
+probe begin
+{
+ /* TEST 1 */
+
+ for (i=0; i<@ITERS; i++)
+ {
+
+ for (j=0; j<@RANDCNT; j++)
+ randvals[j] = randint(@RANDMAX)
+
+ if(i%2)
+ {
+ @feed(x[1], tx)
+ @feed(y[1], ty)
+ }
+ else
+ {
+ @feed(y[1], ty)
+ @feed(x[1], tx)
+ }
+ }
+
+ printdln(" ", "IGNORE", @count(x[1]))
+ printdln(" ", "IGNORE", @count(y[1]), @sum(y[1]), @min(y[1]), @max(y[1]))
+
+ shrinkage = (ty-tx)*100/ty
+
+ treshold = 0
+
+ printf("%s test1 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+
+ /* TEST 2 */
+
+ for (i=0; i<(@ITERS / 4); i++)
+ {
+
+ for (j=0; j<@RANDCNT; j++)
+ randvals[j] = randint(@RANDMAX)
+
+ if(i%2)
+ {
+ @feed(a[1], ta)
+ @feed(b[1], tb)
+ }
+ else
+ {
+ @feed(b[1], tb)
+ @feed(a[1], ta)
+ }
+ }
+
+ printdln(" ", "IGNORE", @count(a[1]))
+ printdln(" ", "IGNORE", @variance(b[1]))
+
+ shrinkage = (tb-ta)*100/tb
+
+ treshold = 20
+
+ printf("%s test2 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+ exit()
+}
return result;
}
+ string stat_op_parms() const
+ {
+ string result = "";
+ result += (sd.stat_ops & (STAT_OP_COUNT|STAT_OP_AVG|STAT_OP_VARIANCE)) ? "1, " : "0, ";
+ result += (sd.stat_ops & (STAT_OP_SUM|STAT_OP_AVG|STAT_OP_VARIANCE)) ? "1, " : "0, ";
+ result += (sd.stat_ops & STAT_OP_MIN) ? "1, " : "0, ";
+ result += (sd.stat_ops & STAT_OP_MAX) ? "1, " : "0, ";
+ result += (sd.stat_ops & STAT_OP_VARIANCE) ? "1" : "0";
+ return result;
+ }
+
string calculate_aggregate() const
{
if (!is_parallel())
// impedance matching: empty strings -> NULL
if (type() == pe_stats)
- res += (call_prefix("add", indices) + ", " + val.value() + ")");
+ res += (call_prefix("add", indices) + ", " + val.value() + ", " + stat_op_parms() + ")");
else
throw SEMANTIC_ERROR(_("adding a value of an unsupported map type"));
o->newline(1) << "? ((int32_t)cycles_atend - (int32_t)cycles_atstart)";
o->newline() << ": (~(int32_t)0) - (int32_t)cycles_atstart + (int32_t)cycles_atend + 1;";
o->indent(-1);
- o->newline() << "_stp_stat_add(g_refresh_timing, cycles_elapsed);";
+ // STP_TIMING requires min, max, avg (and thus count and sum), but not variance.
+ o->newline() << "_stp_stat_add(g_refresh_timing, cycles_elapsed, 1, 1, 1, 1, 0);";
o->newline(-1) << "}";
o->newline() << "#endif";
}
}
else if (op == "<<<")
{
+ int stat_op_count = lval.sdecl().stat_ops & (STAT_OP_COUNT|STAT_OP_AVG|STAT_OP_VARIANCE);
+ int stat_op_sum = lval.sdecl().stat_ops & (STAT_OP_SUM|STAT_OP_AVG|STAT_OP_VARIANCE);
+ int stat_op_min = lval.sdecl().stat_ops & STAT_OP_MIN;
+ int stat_op_max = lval.sdecl().stat_ops & STAT_OP_MAX;
+ int stat_op_variance = lval.sdecl().stat_ops & STAT_OP_VARIANCE;
+
assert(lval.type() == pe_stats);
assert(rval.type() == pe_long);
assert(res.type() == pe_long);
- o->newline() << "_stp_stat_add (" << lval << ", " << rval << ");";
+
+ o->newline() << "_stp_stat_add (" << lval << ", " << rval << ", " <<
+ stat_op_count << ", " << stat_op_sum << ", " <<
+ stat_op_min << ", " << stat_op_max << ", " <<
+ stat_op_variance << ");";
res = rval;
}
else if (res.type() == pe_long)