From: Martin Cermak Date: Wed, 5 Oct 2016 06:59:42 +0000 (+0200) Subject: Introduce runtime optimizations for statistical computations per PR10234. X-Git-Tag: release-3.1~584 X-Git-Url: https://sourceware.org/git/?a=commitdiff_plain;h=26382d613f4d266f74ddb3d3f3f36aceab171e14;p=systemtap.git Introduce runtime optimizations for statistical computations per PR10234. This update improves the performance of systemtap runtime statistical computations by optimizing out unneeded parts of the __stp_stat_add() function. It is based on parametrizing and inlining it. The stap translator now generates _stp_stat_add() or _stp_pmap_add_*() calls that have additional "optimization" parameters corresponding to the stat operators in use for a given global. GCC uses this for optimizing the inlined __stp_stat_add() calls. The optimization effect significantly depends on compiler version, platform architecture, and the stat operators being used for a given global. At the moment, the available stat operators are @count, @sum, @min, @max, @avg, and @variance. The most computationally expensive is @variance. The effect of optimizing @variance is significant. Other stat operators are computationally cheap and so the effect of their optimizations is relatively low. Using gcc-6.2.1-1.fc26.x86_64, the @count, @sum, @min, and @max optimizations bring approximately 8% run time shrinkage. The @variance optimization shrinkage is up to 70% using this compiler. For other architectures, namely for power, the optimization is less effective. runtime/map-gen.c: Pass the additional optimization parameters through the map API generator macros. runtime/map.c: Modify _new_map_set_stat() to accept additional optimization parameters. runtime/map.h: Ditto. runtime/pmap-gen.c: Pass the additional optimization parameters to __stp_map_set*() and to _stp_pmap_add*(). runtime/stat-common.c: Add optimization params to __stp_stat_add(). runtime/stat.c: Add optimization params to _stp_stat_add(). 
tapsets.cxx: Generate parametrized calls to the runtime. translate.cxx: Ditto. testsuite/systemtap.base/optim_stats*: New testcase. --- diff --git a/runtime/map-gen.c b/runtime/map-gen.c index c185a1a3d..413e20edb 100644 --- a/runtime/map-gen.c +++ b/runtime/map-gen.c @@ -48,8 +48,8 @@ #define VALN s #define VALSTOR char value[MAP_STRING_LENGTH] #define MAP_GET_VAL(node) ((node)->value) -#define MAP_SET_VAL(map,node,val,add) _new_map_set_str(map,MAP_GET_VAL(node),val,add) -#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add) +#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_str(map,MAP_GET_VAL(node),val,add) +#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add,0,0,0,0,0) #define NULLRET "" #elif VALUE_TYPE == INT64 #define VALTYPE int64_t @@ -58,8 +58,8 @@ #define VALN i #define VALSTOR int64_t value #define MAP_GET_VAL(node) ((node)->value) -#define MAP_SET_VAL(map,node,val,add) _new_map_set_int64(map,&MAP_GET_VAL(node),val,add) -#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add) +#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_int64(map,&MAP_GET_VAL(node),val,add) +#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add,0,0,0,0,0) #define NULLRET (int64_t)0 #elif VALUE_TYPE == STAT #define VALTYPE stat_data* @@ -68,7 +68,7 @@ #define VALN x #define VALSTOR stat_data value #define MAP_GET_VAL(node) (&(node)->value) -#define MAP_SET_VAL(map,node,val,add) _new_map_set_stat(map,MAP_GET_VAL(node),val,add) +#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_stat(map,MAP_GET_VAL(node),val,add,s1,s2,s3,s4,s5) #define MAP_COPY_VAL(map,node,val,add) _new_map_copy_stat(map,MAP_GET_VAL(node),val,add) #define NULLRET (stat_data*)0 #else @@ -799,7 +799,7 @@ static MAP KEYSYM(_stp_map_new) (int first_arg, ...) 
#endif /* VALUE_TYPE */ -static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add) +static inline int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add, int s1, int s2, int s3, int s4, int s5) { unsigned int hv; struct mhlist_head *head; @@ -817,7 +817,7 @@ static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add) mhlist_for_each_entry(n, e, head, node.hnode) { if (KEY_EQ_P(n)) { - return MAP_SET_VAL(map, n, val, add); + return MAP_SET_VAL(map, n, val, add, s1, s2, s3, s4, s5); } } /* key not found */ @@ -825,17 +825,17 @@ static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add) if (n == NULL) return -1; KEYCPY(n); - return MAP_SET_VAL(map, n, val, 0); + return MAP_SET_VAL(map, n, val, 0, s1, s2, s3, s4, s5); } static int KEYSYM(_stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val) { - return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 0); + return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 0, 1, 1, 1, 1, 1); } static int KEYSYM(_stp_map_add) (MAP map, ALLKEYSD(key), VSTYPE val) { - return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 1); + return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 1, 1, 1, 1, 1, 1); } diff --git a/runtime/map.c b/runtime/map.c index b95c8c2d9..5caf739da 100644 --- a/runtime/map.c +++ b/runtime/map.c @@ -436,7 +436,7 @@ static int _new_map_set_str (MAP map, char *dst, char *val, int add) return 0; } -static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int add) +static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int add, int s1, int s2, int s3, int s4, int s5) { if (!add) { Hist st = &map->hist; @@ -449,7 +449,7 @@ static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int ad } (&map->hist)->bit_shift = map->bit_shift; (&map->hist)->stat_ops = map->stat_ops; - __stp_stat_add (&map->hist, sd, val); + __stp_stat_add (&map->hist, sd, val, s1, s2, s3, s4, s5); return 0; } diff --git 
a/runtime/map.h b/runtime/map.h index cc4bf0711..0a1dfff5f 100644 --- a/runtime/map.h +++ b/runtime/map.h @@ -179,7 +179,7 @@ static void _stp_pmap_del(PMAP pmap); static MAP _stp_pmap_agg (PMAP pmap, map_update_fn update, map_cmp_fn cmp); static struct map_node *_stp_new_agg(MAP agg, struct mhlist_head *ahead, struct map_node *ptr, map_update_fn update); -static int _new_map_set_stat (MAP map, struct stat_data *dst, int64_t val, int add); +static int _new_map_set_stat (MAP map, struct stat_data *dst, int64_t val, int add, int s1, int s2, int s3, int s4, int s5); static int _new_map_copy_stat (MAP map, struct stat_data *dst, struct stat_data *src, int add); static void _stp_map_sort (MAP map, int keynum, int dir, map_get_key_fn get_key); static void _stp_map_sortn(MAP map, int n, int keynum, int dir, map_get_key_fn get_key); diff --git a/runtime/pmap-gen.c b/runtime/pmap-gen.c index 717220595..fbe5c5851 100644 --- a/runtime/pmap-gen.c +++ b/runtime/pmap-gen.c @@ -234,18 +234,18 @@ static int KEYSYM(_stp_pmap_set) (PMAP pmap, ALLKEYSD(key), VSTYPE val) { int res; MAP m = _stp_pmap_get_map (pmap, MAP_GET_CPU()); - res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 0); + res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 0, 1, 1, 1, 1, 1); MAP_PUT_CPU(); return res; } -static int KEYSYM(_stp_pmap_add) (PMAP pmap, ALLKEYSD(key), VSTYPE val) +static inline int KEYSYM(_stp_pmap_add) (PMAP pmap, ALLKEYSD(key), VSTYPE val, int s1, int s2, int s3, int s4, int s5) { int res; MAP m = _stp_pmap_get_map (pmap, MAP_GET_CPU()); m->bit_shift = pmap->bit_shift; m->stat_ops = pmap->stat_ops; - res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 1); + res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 1, s1, s2, s3, s4, s5); MAP_PUT_CPU(); return res; } diff --git a/runtime/stat-common.c b/runtime/stat-common.c index e58b1c232..764d84cc7 100644 --- a/runtime/stat-common.c +++ b/runtime/stat-common.c @@ -288,15 +288,13 @@ static void _stp_stat_print_histogram(Hist st, stat_data *sd) 
_stp_print_flush(); } -static void __stp_stat_add(Hist st, stat_data *sd, int64_t val) +static inline void __stp_stat_add(Hist st, stat_data *sd, int64_t val, + int stat_op_count, int stat_op_sum, int stat_op_min, + int stat_op_max, int stat_op_variance) { int n; int delta = 0; - /* - * Below, we use Welford's online algorithm for computing variance. - * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - */ sd->shift = st->bit_shift; sd->stat_ops = st->stat_ops; if (sd->count == 0) { @@ -305,20 +303,19 @@ static void __stp_stat_add(Hist st, stat_data *sd, int64_t val) sd->avg_s = val << sd->shift; sd->_M2 = 0; } else { - sd->count++; - sd->sum += val; - if (val > sd->max) + if(stat_op_count) + sd->count++; + if(stat_op_sum) + sd->sum += val; + if (stat_op_min && (val > sd->max)) sd->max = val; - if (val < sd->min) + if (stat_op_max && (val < sd->min)) sd->min = val; /* - * Following is an optimization that improves performance - * in case @variance() isn't used with given global. - * - * Note that this doesn't affect computing of @avg(), which - * happens within the per-CPU aggregation functions. + * Below, we use Welford's online algorithm for computing variance. + * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance */ - if (sd->stat_ops & STAT_OP_VARIANCE) { + if (stat_op_variance) { delta = (val << sd->shift) - sd->avg_s; sd->avg_s += _stp_div64(NULL, delta, sd->count); sd->_M2 += delta * ((val << sd->shift) - sd->avg_s); diff --git a/runtime/stat.c b/runtime/stat.c index 62ef538dc..7fc8ac525 100644 --- a/runtime/stat.c +++ b/runtime/stat.c @@ -130,21 +130,32 @@ static void _stp_stat_del (Stat st) } /** Add to a Stat. - * Add an int64 to a Stat. + * Add an int64 to a Stat, and for optimization purposes specify which + * statistical operators are bound to given Stat. Set all of stat_op* + * to 1 if unsure. 
Note that @avg() is being evaluated separately based + * on @sum and @count within the code directly generated by the translator. * * @param st Stat * @param val Value to add + * @param stat_op_count int + * @param stat_op_sum int + * @param stat_op_min int + * @param stat_op_max int + * @param stat_op_variance int + * */ -static void _stp_stat_add (Stat st, int64_t val) +static inline void _stp_stat_add (Stat st, int64_t val, int stat_op_count, + int stat_op_sum, int stat_op_min, + int stat_op_max, int stat_op_variance) { stat_data *sd = _stp_stat_per_cpu_ptr (st, STAT_GET_CPU()); STAT_LOCK(sd); - __stp_stat_add (&st->hist, sd, val); + __stp_stat_add (&st->hist, sd, val, stat_op_count, stat_op_sum, + stat_op_min, stat_op_max, stat_op_variance); STAT_UNLOCK(sd); STAT_PUT_CPU(); } - static void _stp_stat_clear_data (Stat st, stat_data *sd) { int j; diff --git a/tapsets.cxx b/tapsets.cxx index 016dd3aba..164881f44 100644 --- a/tapsets.cxx +++ b/tapsets.cxx @@ -269,7 +269,8 @@ common_probe_entryfn_epilogue (systemtap_session& s, } s.op->newline() << "#ifdef STP_TIMING"; - s.op->newline() << "if (likely (stat)) _stp_stat_add(stat, cycles_elapsed);"; + // STP_TIMING requires min, max, avg (and thus count and sum), but not variance. + s.op->newline() << "if (likely (stat)) _stp_stat_add(stat, cycles_elapsed, 1, 1, 1, 1, 0);"; s.op->newline() << "#endif"; if (overload_processing && !s.runtime_usermode_p()) diff --git a/testsuite/systemtap.base/optim_stats.exp b/testsuite/systemtap.base/optim_stats.exp new file mode 100644 index 000000000..e46de40d3 --- /dev/null +++ b/testsuite/systemtap.base/optim_stats.exp @@ -0,0 +1,31 @@ +# This is a test for stat run time optimizations. +# See corresponding .stp file for details. 
+ +set test "optim_stats" + +if {![installtest_p]} { + untested $test + return +} + +for {set i 1} {$i <= 2} {incr i} { + foreach runtime [get_runtime_list] { + if {$runtime != ""} { + spawn stap --runtime=$runtime -g --suppress-time-limits $srcdir/$subdir/$test$i.stp + } else { + spawn stap -g --suppress-time-limits $srcdir/$subdir/$test$i.stp + } + + expect { + -timeout 300 + -re {^IGNORE[^\r\n]+\r\n} { exp_continue } + -re {^PASS test1[^\r\n]+\r\n} { pass "$test$i.stp subtest1 $runtime"; exp_continue } + -re {^PASS test2[^\r\n]+\r\n} { pass "$test$i.stp subtest2 $runtime"; exp_continue } + -re {^FAIL test1[^\r\n]+\r\n} { fail "$test$i.stp subtest1 $runtime"; exp_continue } + -re {^FAIL test2[^\r\n]+\r\n} { fail "$test$i.stp subtest2 $runtime"; exp_continue } + timeout {fail "$test: unexpected timeout"} + eof { } + } + catch {close}; catch {wait} + } +} diff --git a/testsuite/systemtap.base/optim_stats1.stp b/testsuite/systemtap.base/optim_stats1.stp new file mode 100644 index 000000000..2144b7bb2 --- /dev/null +++ b/testsuite/systemtap.base/optim_stats1.stp @@ -0,0 +1,136 @@ +/* + * This is a test for stat run time optimizations. Each stat has a list of + * requested statistical operators. For instance, if a script uses stat x, + * and only refers to @avg(x), then the list of requested statistical operators + * for given stat x is @count, @sum, and @avg. The @min(x) and @max(x) are + * not in the list, and thus do not need to be avaluated at the _stp_stat_add() + * time (iow, at the x<<= treshold) ? "PASS" : "FAIL"), shrinkage) + + + /* TEST 2: test optimizations for @variance. 
*/ + + for (i=0; i<(@ITERS / 4); i++) + { + + for (j=0; j<@RANDCNT; j++) + randvals[j] = randint(@RANDMAX) + + if(i%2) + { + @feed(a, ta) + @feed(b, tb) + } + else + { + @feed(b, tb) + @feed(a, ta) + } + } + + printdln(" ", "IGNORE", @count(a)) + printdln(" ", "IGNORE", @variance(b)) + + shrinkage = (tb-ta)*100/tb + + /* + * Treshold [%], for this test the usual value is around 68% at the time + * of writing this test. + */ + treshold = 20 + + printf("%s test2 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage) + + exit() +} diff --git a/testsuite/systemtap.base/optim_stats2.stp b/testsuite/systemtap.base/optim_stats2.stp new file mode 100644 index 000000000..53bbc6914 --- /dev/null +++ b/testsuite/systemtap.base/optim_stats2.stp @@ -0,0 +1,85 @@ +/* + * Analogy to optim_stats1.stp, but for pmaps. See optim_stats1.stp for comments. + */ + +@define RANDCNT %( 200000 %) +@define RANDMAX %( 1000 %) +@define ITERS %( 1500 %) + +@define feed(agg, tagg) +%( + t = time() + foreach(k in randvals) + @agg <<< k + @tagg += time() - t +%) + +global x, tx = 0, y, ty = 0 +global a, ta = 0, b, tb = 0 +global randvals[@RANDCNT] + +function time() { return gettimeofday_us() } + +probe begin +{ + /* TEST 1 */ + + for (i=0; i<@ITERS; i++) + { + + for (j=0; j<@RANDCNT; j++) + randvals[j] = randint(@RANDMAX) + + if(i%2) + { + @feed(x[1], tx) + @feed(y[1], ty) + } + else + { + @feed(y[1], ty) + @feed(x[1], tx) + } + } + + printdln(" ", "IGNORE", @count(x[1])) + printdln(" ", "IGNORE", @count(y[1]), @sum(y[1]), @min(y[1]), @max(y[1])) + + shrinkage = (ty-tx)*100/ty + + treshold = 0 + + printf("%s test1 (%d)\n", ((shrinkage >= treshold) ? 
"PASS" : "FAIL"), shrinkage) + + + /* TEST 2 */ + + for (i=0; i<(@ITERS / 4); i++) + { + + for (j=0; j<@RANDCNT; j++) + randvals[j] = randint(@RANDMAX) + + if(i%2) + { + @feed(a[1], ta) + @feed(b[1], tb) + } + else + { + @feed(b[1], tb) + @feed(a[1], ta) + } + } + + printdln(" ", "IGNORE", @count(a[1])) + printdln(" ", "IGNORE", @variance(b[1])) + + shrinkage = (tb-ta)*100/tb + + treshold = 20 + + printf("%s test2 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage) + + exit() +} diff --git a/translate.cxx b/translate.cxx index 73e0f0a2e..f4c833226 100644 --- a/translate.cxx +++ b/translate.cxx @@ -742,6 +742,17 @@ struct mapvar return result; } + string stat_op_parms() const + { + string result = ""; + result += (sd.stat_ops & (STAT_OP_COUNT|STAT_OP_AVG|STAT_OP_VARIANCE)) ? "1, " : "0, "; + result += (sd.stat_ops & (STAT_OP_SUM|STAT_OP_AVG|STAT_OP_VARIANCE)) ? "1, " : "0, "; + result += (sd.stat_ops & STAT_OP_MIN) ? "1, " : "0, "; + result += (sd.stat_ops & STAT_OP_MAX) ? "1, " : "0, "; + result += (sd.stat_ops & STAT_OP_VARIANCE) ? "1" : "0"; + return result; + } + string calculate_aggregate() const { if (!is_parallel()) @@ -793,7 +804,7 @@ struct mapvar // impedance matching: empty strings -> NULL if (type() == pe_stats) - res += (call_prefix("add", indices) + ", " + val.value() + ")"); + res += (call_prefix("add", indices) + ", " + val.value() + ", " + stat_op_parms() + ")"); else throw SEMANTIC_ERROR(_("adding a value of an unsupported map type")); @@ -2128,7 +2139,8 @@ c_unparser::emit_module_refresh () o->newline(1) << "? ((int32_t)cycles_atend - (int32_t)cycles_atstart)"; o->newline() << ": (~(int32_t)0) - (int32_t)cycles_atstart + (int32_t)cycles_atend + 1;"; o->indent(-1); - o->newline() << "_stp_stat_add(g_refresh_timing, cycles_elapsed);"; + // STP_TIMING requires min, max, avg (and thus count and sum), but not variance. 
+ o->newline() << "_stp_stat_add(g_refresh_timing, cycles_elapsed, 1, 1, 1, 1, 0);"; o->newline(-1) << "}"; o->newline() << "#endif"; } @@ -3390,10 +3402,20 @@ c_unparser_assignment::c_assignop(tmpvar & res, } else if (op == "<<<") { + int stat_op_count = lval.sdecl().stat_ops & (STAT_OP_COUNT|STAT_OP_AVG|STAT_OP_VARIANCE); + int stat_op_sum = lval.sdecl().stat_ops & (STAT_OP_SUM|STAT_OP_AVG|STAT_OP_VARIANCE); + int stat_op_min = lval.sdecl().stat_ops & STAT_OP_MIN; + int stat_op_max = lval.sdecl().stat_ops & STAT_OP_MAX; + int stat_op_variance = lval.sdecl().stat_ops & STAT_OP_VARIANCE; + assert(lval.type() == pe_stats); assert(rval.type() == pe_long); assert(res.type() == pe_long); - o->newline() << "_stp_stat_add (" << lval << ", " << rval << ");"; + + o->newline() << "_stp_stat_add (" << lval << ", " << rval << ", " << + stat_op_count << ", " << stat_op_sum << ", " << + stat_op_min << ", " << stat_op_max << ", " << + stat_op_variance << ");"; res = rval; } else if (res.type() == pe_long)