From: Martin Cermak <mcermak@redhat.com>
Date: Wed, 5 Oct 2016 06:59:42 +0000 (+0200)
Subject: Introduce runtime optimizations for statistical computations per PR10234.
X-Git-Tag: release-3.1~584
X-Git-Url: https://sourceware.org/git/?a=commitdiff_plain;h=26382d613f4d266f74ddb3d3f3f36aceab171e14;p=systemtap.git

Introduce runtime optimizations for statistical computations per PR10234.

This update improves the performance of systemtap runtime statistical
computations by optimizing out unneeded parts of the __stp_stat_add()
function.  It is based on parametrizing and inlining it.  The stap
translator now generates _stp_stat_add(), or _stp_pmap_add_*() calls
that have additional "optimization" parameters respective to stats
in use for given global.  GCC uses this for optimizing the inlined
__stp_stat_add() calls.

The optimization effect significantly depends on compiler version,
platform architecture, and the stat operators being used for given
global.  At the moment, the available stat operators are @count,
@sum, @min, @max, @avg, and @variance. The most computationally
expensive is @variance.  The effect of optimizing @variance is
significant.  Other stat operators are computationally cheap and so
the effect of their optimizations is relatively low.

Using gcc-6.2.1-1.fc26.x86_64, the @count, @sum, @min, and @max
optimizations bring approximately 8% run time shrinkage. The
@variance optimization shrinkage is up to 70% using this compiler.
For other architectures, namely for power, the optimization is less
effective.

runtime/map-gen.c: Pass the additional optimization parameters through
	the map API generator macros.
runtime/map.c: Modify _new_map_set_stat() to accept additional
        optimization parameters.
runtime/map.h: Ditto.
runtime/pmap-gen.c: Pass the additional optimization parameters to
	__stp_map_set*() and to _stp_pmap_add*().
runtime/stat-common.c: Add optimization params to __stp_stat_add().
runtime/stat.c: Add optimization params to _stp_stat_add().
tapsets.cxx: Generate parametrized calls to the runtime.
translate.cxx: Ditto.
testsuite/systemtap.base/optim_stats*: New testcase.
---

diff --git a/runtime/map-gen.c b/runtime/map-gen.c
index c185a1a3d..413e20edb 100644
--- a/runtime/map-gen.c
+++ b/runtime/map-gen.c
@@ -48,8 +48,8 @@
 #define VALN s
 #define VALSTOR char value[MAP_STRING_LENGTH]
 #define MAP_GET_VAL(node) ((node)->value)
-#define MAP_SET_VAL(map,node,val,add) _new_map_set_str(map,MAP_GET_VAL(node),val,add)
-#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add)
+#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_str(map,MAP_GET_VAL(node),val,add)
+#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add,0,0,0,0,0)
 #define NULLRET ""
 #elif VALUE_TYPE == INT64
 #define VALTYPE int64_t
@@ -58,8 +58,8 @@
 #define VALN i
 #define VALSTOR int64_t value
 #define MAP_GET_VAL(node) ((node)->value)
-#define MAP_SET_VAL(map,node,val,add) _new_map_set_int64(map,&MAP_GET_VAL(node),val,add)
-#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add)
+#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_int64(map,&MAP_GET_VAL(node),val,add)
+#define MAP_COPY_VAL(map,node,val,add) MAP_SET_VAL(map,node,val,add,0,0,0,0,0)
 #define NULLRET (int64_t)0
 #elif VALUE_TYPE == STAT
 #define VALTYPE stat_data*
@@ -68,7 +68,7 @@
 #define VALN x
 #define VALSTOR stat_data value
 #define MAP_GET_VAL(node) (&(node)->value)
-#define MAP_SET_VAL(map,node,val,add) _new_map_set_stat(map,MAP_GET_VAL(node),val,add)
+#define MAP_SET_VAL(map,node,val,add,s1,s2,s3,s4,s5) _new_map_set_stat(map,MAP_GET_VAL(node),val,add,s1,s2,s3,s4,s5)
 #define MAP_COPY_VAL(map,node,val,add) _new_map_copy_stat(map,MAP_GET_VAL(node),val,add)
 #define NULLRET (stat_data*)0
 #else
@@ -799,7 +799,7 @@ static MAP KEYSYM(_stp_map_new) (int first_arg, ...)
 
 #endif /* VALUE_TYPE */
 
-static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add)
+static inline int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add, int s1, int s2, int s3, int s4, int s5)
 {
 	unsigned int hv;
 	struct mhlist_head *head;
@@ -817,7 +817,7 @@ static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add)
 
 	mhlist_for_each_entry(n, e, head, node.hnode) {
 		if (KEY_EQ_P(n)) {
-			return MAP_SET_VAL(map, n, val, add);
+			return MAP_SET_VAL(map, n, val, add, s1, s2, s3, s4, s5);
 		}
 	}
 	/* key not found */
@@ -825,17 +825,17 @@ static int KEYSYM(__stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val, int add)
 	if (n == NULL)
 		return -1;
 	KEYCPY(n);
-	return MAP_SET_VAL(map, n, val, 0);
+	return MAP_SET_VAL(map, n, val, 0, s1, s2, s3, s4, s5);
 }
 
 static int KEYSYM(_stp_map_set) (MAP map, ALLKEYSD(key), VSTYPE val)
 {
-	return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 0);
+	return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 0, 1, 1, 1, 1, 1);
 }
 
 static int KEYSYM(_stp_map_add) (MAP map, ALLKEYSD(key), VSTYPE val)
 {
-	return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 1);
+	return KEYSYM(__stp_map_set) (map, ALLKEYS(key), val, 1, 1, 1, 1, 1, 1);
 }
 
 
diff --git a/runtime/map.c b/runtime/map.c
index b95c8c2d9..5caf739da 100644
--- a/runtime/map.c
+++ b/runtime/map.c
@@ -436,7 +436,7 @@ static int _new_map_set_str (MAP map, char *dst, char *val, int add)
 	return 0;
 }
 
-static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int add)
+static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int add, int s1, int s2, int s3, int s4, int s5)
 {
 	if (!add) {
 		Hist st = &map->hist;
@@ -449,7 +449,7 @@ static int _new_map_set_stat (MAP map, struct stat_data *sd, int64_t val, int ad
 	}
 	(&map->hist)->bit_shift = map->bit_shift;
 	(&map->hist)->stat_ops = map->stat_ops;
-	__stp_stat_add (&map->hist, sd, val);
+	__stp_stat_add (&map->hist, sd, val, s1, s2, s3, s4, s5);
 	return 0;
 }
 
diff --git a/runtime/map.h b/runtime/map.h
index cc4bf0711..0a1dfff5f 100644
--- a/runtime/map.h
+++ b/runtime/map.h
@@ -179,7 +179,7 @@ static void _stp_pmap_del(PMAP pmap);
 static MAP _stp_pmap_agg (PMAP pmap, map_update_fn update, map_cmp_fn cmp);
 static struct map_node *_stp_new_agg(MAP agg, struct mhlist_head *ahead,
 				     struct map_node *ptr, map_update_fn update);
-static int _new_map_set_stat (MAP map, struct stat_data *dst, int64_t val, int add);
+static int _new_map_set_stat (MAP map, struct stat_data *dst, int64_t val, int add, int s1, int s2, int s3, int s4, int s5);
 static int _new_map_copy_stat (MAP map, struct stat_data *dst, struct stat_data *src, int add);
 static void _stp_map_sort (MAP map, int keynum, int dir, map_get_key_fn get_key);
 static void _stp_map_sortn(MAP map, int n, int keynum, int dir, map_get_key_fn get_key);
diff --git a/runtime/pmap-gen.c b/runtime/pmap-gen.c
index 717220595..fbe5c5851 100644
--- a/runtime/pmap-gen.c
+++ b/runtime/pmap-gen.c
@@ -234,18 +234,18 @@ static int KEYSYM(_stp_pmap_set) (PMAP pmap, ALLKEYSD(key), VSTYPE val)
 {
 	int res;
 	MAP m = _stp_pmap_get_map (pmap, MAP_GET_CPU());
-	res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 0);
+	res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 0, 1, 1, 1, 1, 1);
         MAP_PUT_CPU();
 	return res;
 }
 
-static int KEYSYM(_stp_pmap_add) (PMAP pmap, ALLKEYSD(key), VSTYPE val)
+static inline int KEYSYM(_stp_pmap_add) (PMAP pmap, ALLKEYSD(key), VSTYPE val, int s1, int s2, int s3, int s4, int s5)
 {
 	int res;
 	MAP m = _stp_pmap_get_map (pmap, MAP_GET_CPU());
 	m->bit_shift = pmap->bit_shift;
 	m->stat_ops = pmap->stat_ops;
-	res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 1);
+	res = KEYSYM(__stp_map_set) (m, ALLKEYS(key), val, 1, s1, s2, s3, s4, s5);
         MAP_PUT_CPU();
 	return res;
 }
diff --git a/runtime/stat-common.c b/runtime/stat-common.c
index e58b1c232..764d84cc7 100644
--- a/runtime/stat-common.c
+++ b/runtime/stat-common.c
@@ -288,15 +288,13 @@ static void _stp_stat_print_histogram(Hist st, stat_data *sd)
 	_stp_print_flush();
 }
 
-static void __stp_stat_add(Hist st, stat_data *sd, int64_t val)
+static inline void __stp_stat_add(Hist st, stat_data *sd, int64_t val,
+                                  int stat_op_count, int stat_op_sum, int stat_op_min,
+				  int stat_op_max, int stat_op_variance)
 {
 	int n;
 	int delta = 0;
 
-	/*
-	 * Below, we use Welford's online algorithm for computing variance.
-	 * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-	 */
 	sd->shift = st->bit_shift;
 	sd->stat_ops = st->stat_ops;
 	if (sd->count == 0) {
@@ -305,20 +303,19 @@ static void __stp_stat_add(Hist st, stat_data *sd, int64_t val)
 		sd->avg_s = val << sd->shift;
 		sd->_M2 = 0;
 	} else {
-		sd->count++;
-		sd->sum += val;
-		if (val > sd->max)
+		if(stat_op_count)
+			sd->count++;
+		if(stat_op_sum)
+			sd->sum += val;
+		if (stat_op_max && (val > sd->max))	/* @max flag gates the max update */
 			sd->max = val;
-		if (val < sd->min)
+		if (stat_op_min && (val < sd->min))	/* @min flag gates the min update */
 			sd->min = val;
 		/*
-		 * Following is an optimization that improves performance
-		 * in case @variance() isn't used with given global.
-		 *
-		 * Note that this doesn't affect computing of @avg(), which
-		 * happens within the per-CPU aggregation functions.
+		 * Below, we use Welford's online algorithm for computing variance.
+		 * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 		 */
-		if (sd->stat_ops & STAT_OP_VARIANCE) {
+		if (stat_op_variance) {
 		    delta = (val << sd->shift) - sd->avg_s;
 		    sd->avg_s += _stp_div64(NULL, delta, sd->count);
 		    sd->_M2 += delta * ((val << sd->shift) - sd->avg_s);
diff --git a/runtime/stat.c b/runtime/stat.c
index 62ef538dc..7fc8ac525 100644
--- a/runtime/stat.c
+++ b/runtime/stat.c
@@ -130,21 +130,32 @@ static void _stp_stat_del (Stat st)
 }
 
 /** Add to a Stat.
- * Add an int64 to a Stat.
+ * Add an int64 to a Stat, and for optimization purposes specify which
+ * statistical operators are bound to given Stat.  Set all of stat_op*
+ * to 1 if unsure.  Note that @avg() is being evaluated separately based
+ * on @sum and @count within the code directly generated by the translator.
  *
  * @param st Stat
  * @param val Value to add
+ * @param stat_op_count int
+ * @param stat_op_sum int
+ * @param stat_op_min int
+ * @param stat_op_max int
+ * @param stat_op_variance int
+ *
  */
-static void _stp_stat_add (Stat st, int64_t val)
+static inline void _stp_stat_add (Stat st, int64_t val, int stat_op_count,
+                                  int stat_op_sum, int stat_op_min,
+				  int stat_op_max, int stat_op_variance)
 {
 	stat_data *sd = _stp_stat_per_cpu_ptr (st, STAT_GET_CPU());
 	STAT_LOCK(sd);
-	__stp_stat_add (&st->hist, sd, val);
+	__stp_stat_add (&st->hist, sd, val, stat_op_count, stat_op_sum,
+	                stat_op_min, stat_op_max, stat_op_variance);
 	STAT_UNLOCK(sd);
 	STAT_PUT_CPU();
 }
 
-
 static void _stp_stat_clear_data (Stat st, stat_data *sd)
 {
         int j;
diff --git a/tapsets.cxx b/tapsets.cxx
index 016dd3aba..164881f44 100644
--- a/tapsets.cxx
+++ b/tapsets.cxx
@@ -269,7 +269,8 @@ common_probe_entryfn_epilogue (systemtap_session& s,
     }
 
   s.op->newline() << "#ifdef STP_TIMING";
-  s.op->newline() << "if (likely (stat)) _stp_stat_add(stat, cycles_elapsed);";
+  // STP_TIMING requires min, max, avg (and thus count and sum), but not variance.
+  s.op->newline() << "if (likely (stat)) _stp_stat_add(stat, cycles_elapsed, 1, 1, 1, 1, 0);";
   s.op->newline() << "#endif";
 
   if (overload_processing && !s.runtime_usermode_p())
diff --git a/testsuite/systemtap.base/optim_stats.exp b/testsuite/systemtap.base/optim_stats.exp
new file mode 100644
index 000000000..e46de40d3
--- /dev/null
+++ b/testsuite/systemtap.base/optim_stats.exp
@@ -0,0 +1,31 @@
+# This is a test for stat run time optimizations.
+# See corresponding .stp file for details.
+
+set test "optim_stats"
+
+if {![installtest_p]} {
+    untested $test
+    return
+}
+
+for {set i 1} {$i <= 2} {incr i} {
+    foreach runtime [get_runtime_list] {
+	if {$runtime != ""} {
+	    spawn stap --runtime=$runtime -g --suppress-time-limits $srcdir/$subdir/$test$i.stp
+	} else {
+	    spawn stap -g --suppress-time-limits $srcdir/$subdir/$test$i.stp
+	}
+
+	expect {
+	    -timeout 300
+	    -re {^IGNORE[^\r\n]+\r\n} { exp_continue }
+	    -re {^PASS test1[^\r\n]+\r\n} { pass "$test$i.stp subtest1 $runtime"; exp_continue }
+	    -re {^PASS test2[^\r\n]+\r\n} { pass "$test$i.stp subtest2 $runtime"; exp_continue }
+	    -re {^FAIL test1[^\r\n]+\r\n} { fail "$test$i.stp subtest1 $runtime"; exp_continue }
+	    -re {^FAIL test2[^\r\n]+\r\n} { fail "$test$i.stp subtest2 $runtime"; exp_continue }
+	    timeout {fail "$test: unexpected timeout"}
+	    eof { }
+	}
+	catch {close}; catch {wait}
+    }
+}
diff --git a/testsuite/systemtap.base/optim_stats1.stp b/testsuite/systemtap.base/optim_stats1.stp
new file mode 100644
index 000000000..2144b7bb2
--- /dev/null
+++ b/testsuite/systemtap.base/optim_stats1.stp
@@ -0,0 +1,136 @@
+/*
+ * This is a test for stat run time optimizations.  Each stat has a list of
+ * requested statistical operators.  For instance, if a script uses stat x,
+ * and only refers to @avg(x), then the list of requested statistical operators
+ * for given stat x is @count, @sum, and @avg.  The @min(x) and @max(x) are
+ * not in the list, and thus do not need to be evaluated at the _stp_stat_add()
+ * time (i.e., at the x<<<val time).  Optimization based on this makes the
+ * systemtap runtime run faster. The goal of this test is to verify that this
+ * sort of optimizations actually works in a measurable way.
+ *
+ * At the moment, the available stat operators are @count, @sum, @min, @max,
+ * @avg, and @variance.  The most computationally expensive is @variance.
+ * Detecting the variance optimization is quite simple.  Other operators are
+ * computationally cheap and thus detecting their respective optimizations is
+ * somewhat tricky on a multiuser/multitasking system, where so many irrelevant
+ * influences are affecting our fragile measurement.  In this case we must set
+ * the threshold distinguishing between PASS and FAIL pretty carefully: just
+ * slightly above the "noise".  This testcase is bound to be fragile by its
+ * nature though.
+ *
+ * One of the basic assumptions for this sort of test is that if we compare stats
+ * having identical list of requested statistical operators, we should get very
+ * similar results.  It turns out, that to achieve this, we can't simply feed the
+ * values into measured stats in straightforward order. Instead, we need to baffle
+ * the optimizations under the hood by complicating the "feed" order slightly.
+ * After verifying this assumption, we can start comparing different stats.
+ *
+ * Since verifying the @variance optimization is much easier and doesn't require
+ * so many time consuming iterations to get reasonable results, this test is
+ * divided into two parts, TEST 1, and TEST 2, where in TEST 1 we focus on the
+ * optimization for @count, @sum, @min, and @max, and then, in TEST 2, we test the
+ * @variance optimization separately. This makes the test itself run faster.
+ *
+ */
+
+@define RANDCNT %( 200000 %)
+@define RANDMAX %( 1000 %)
+@define ITERS %( 1500 %)
+
+@define feed(agg, tagg)
+%(
+    t = time()
+    foreach(k in randvals)
+	@agg <<< k
+    @tagg += time() - t
+%)
+
+global x, tx = 0, y, ty = 0
+global a, ta = 0, b, tb = 0
+global randvals[@RANDCNT]
+
+function time() { return gettimeofday_us() }
+
+probe begin
+{
+    /* TEST 1: test optimizations for @count, @sum, @min, and @max. */
+
+    for (i=0; i<@ITERS; i++)
+    {
+
+	for (j=0; j<@RANDCNT; j++)
+	    randvals[j] = randint(@RANDMAX)
+
+	/* The "ordering dance" described above happens here */
+	if(i%2)
+	{
+	    @feed(x, tx)
+	    @feed(y, ty)
+	}
+	else
+	{
+	    @feed(y, ty)
+	    @feed(x, tx)
+	}
+    }
+
+    /*
+     * We need to print the stats out to avoid compiler elision.
+     * The list of stats mentioned below makes the actual difference
+     * between stats under test and is the gist of this test.  The test
+     * should show no measurable shrinkage, if the below list doesn't
+     * differ for measured stats.
+     */
+    printdln(" ", "IGNORE", @count(x))
+    printdln(" ", "IGNORE", @count(y), @sum(y), @min(y), @max(y))
+
+    /* Measured shrinkage [%] */
+    shrinkage = (ty-tx)*100/ty
+
+    /*
+     * Treshold [%] (just slightly above the "noise") The usual values were
+     * around 8% at the time of writing this test using gcc-6.2.1-1.fc26.x86_64.
+     * But deeper testing shows, that on other arches, namely on power and arm,
+     * gcc is not so good optimizing the runtime code, so here we only check
+     * for regressions.
+     */
+    treshold = 0
+
+    printf("%s test1 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+
+    /* TEST 2: test optimizations for @variance. */
+
+    for (i=0; i<(@ITERS / 4); i++)
+    {
+
+	for (j=0; j<@RANDCNT; j++)
+	    randvals[j] = randint(@RANDMAX)
+
+	if(i%2)
+	{
+	    @feed(a, ta)
+	    @feed(b, tb)
+	}
+	else
+	{
+	    @feed(b, tb)
+	    @feed(a, ta)
+	}
+    }
+
+    printdln(" ", "IGNORE", @count(a))
+    printdln(" ", "IGNORE", @variance(b))
+
+    shrinkage = (tb-ta)*100/tb
+
+    /*
+     * Treshold [%], for this test the usual value is around 68% at the time
+     * of writing this test.
+     */
+    treshold = 20
+
+    printf("%s test2 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+    exit()
+}
diff --git a/testsuite/systemtap.base/optim_stats2.stp b/testsuite/systemtap.base/optim_stats2.stp
new file mode 100644
index 000000000..53bbc6914
--- /dev/null
+++ b/testsuite/systemtap.base/optim_stats2.stp
@@ -0,0 +1,85 @@
+/*
+ * Analogy to optim_stats1.stp, but for pmaps.  See optim_stats1.stp for comments.
+ */
+
+@define RANDCNT %( 200000 %)
+@define RANDMAX %( 1000 %)
+@define ITERS %( 1500 %)
+
+@define feed(agg, tagg)
+%(
+    t = time()
+    foreach(k in randvals)
+	@agg <<< k
+    @tagg += time() - t
+%)
+
+global x, tx = 0, y, ty = 0
+global a, ta = 0, b, tb = 0
+global randvals[@RANDCNT]
+
+function time() { return gettimeofday_us() }
+
+probe begin
+{
+    /* TEST 1 */
+
+    for (i=0; i<@ITERS; i++)
+    {
+
+	for (j=0; j<@RANDCNT; j++)
+	    randvals[j] = randint(@RANDMAX)
+
+	if(i%2)
+	{
+	    @feed(x[1], tx)
+	    @feed(y[1], ty)
+	}
+	else
+	{
+	    @feed(y[1], ty)
+	    @feed(x[1], tx)
+	}
+    }
+
+    printdln(" ", "IGNORE", @count(x[1]))
+    printdln(" ", "IGNORE", @count(y[1]), @sum(y[1]), @min(y[1]), @max(y[1]))
+
+    shrinkage = (ty-tx)*100/ty
+
+    treshold = 0
+
+    printf("%s test1 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+
+    /* TEST 2 */
+
+    for (i=0; i<(@ITERS / 4); i++)
+    {
+
+	for (j=0; j<@RANDCNT; j++)
+	    randvals[j] = randint(@RANDMAX)
+
+	if(i%2)
+	{
+	    @feed(a[1], ta)
+	    @feed(b[1], tb)
+	}
+	else
+	{
+	    @feed(b[1], tb)
+	    @feed(a[1], ta)
+	}
+    }
+
+    printdln(" ", "IGNORE", @count(a[1]))
+    printdln(" ", "IGNORE", @variance(b[1]))
+
+    shrinkage = (tb-ta)*100/tb
+
+    treshold = 20
+
+    printf("%s test2 (%d)\n", ((shrinkage >= treshold) ? "PASS" : "FAIL"), shrinkage)
+
+    exit()
+}
diff --git a/translate.cxx b/translate.cxx
index 73e0f0a2e..f4c833226 100644
--- a/translate.cxx
+++ b/translate.cxx
@@ -742,6 +742,17 @@ struct mapvar
     return result;
   }
 
+  string stat_op_parms() const
+  {
+    string result = "";
+    result += (sd.stat_ops & (STAT_OP_COUNT|STAT_OP_AVG|STAT_OP_VARIANCE)) ? "1, " : "0, ";
+    result += (sd.stat_ops & (STAT_OP_SUM|STAT_OP_AVG|STAT_OP_VARIANCE)) ? "1, " : "0, ";
+    result += (sd.stat_ops & STAT_OP_MIN) ? "1, " : "0, ";
+    result += (sd.stat_ops & STAT_OP_MAX) ? "1, " : "0, ";
+    result += (sd.stat_ops & STAT_OP_VARIANCE) ? "1" : "0";
+    return result;
+  }
+
   string calculate_aggregate() const
   {
     if (!is_parallel())
@@ -793,7 +804,7 @@ struct mapvar
 
     // impedance matching: empty strings -> NULL
     if (type() == pe_stats)
-      res += (call_prefix("add", indices) + ", " + val.value() + ")");
+      res += (call_prefix("add", indices) + ", " + val.value() + ", " + stat_op_parms() + ")");
     else
       throw SEMANTIC_ERROR(_("adding a value of an unsupported map type"));
 
@@ -2128,7 +2139,8 @@ c_unparser::emit_module_refresh ()
       o->newline(1) << "? ((int32_t)cycles_atend - (int32_t)cycles_atstart)";
       o->newline() << ": (~(int32_t)0) - (int32_t)cycles_atstart + (int32_t)cycles_atend + 1;";
       o->indent(-1);
-      o->newline() << "_stp_stat_add(g_refresh_timing, cycles_elapsed);";
+      // STP_TIMING requires min, max, avg (and thus count and sum), but not variance.
+      o->newline() << "_stp_stat_add(g_refresh_timing, cycles_elapsed, 1, 1, 1, 1, 0);";
       o->newline(-1) << "}";
       o->newline() << "#endif";
     }
@@ -3390,10 +3402,20 @@ c_unparser_assignment::c_assignop(tmpvar & res,
     }
   else if (op == "<<<")
     {
+      int stat_op_count = lval.sdecl().stat_ops & (STAT_OP_COUNT|STAT_OP_AVG|STAT_OP_VARIANCE);
+      int stat_op_sum = lval.sdecl().stat_ops & (STAT_OP_SUM|STAT_OP_AVG|STAT_OP_VARIANCE);
+      int stat_op_min = lval.sdecl().stat_ops & STAT_OP_MIN;
+      int stat_op_max = lval.sdecl().stat_ops & STAT_OP_MAX;
+      int stat_op_variance = lval.sdecl().stat_ops & STAT_OP_VARIANCE;
+
       assert(lval.type() == pe_stats);
       assert(rval.type() == pe_long);
       assert(res.type() == pe_long);
-      o->newline() << "_stp_stat_add (" << lval << ", " << rval << ");";
+
+      o->newline() << "_stp_stat_add (" << lval << ", " << rval << ", " <<
+                      stat_op_count << ", " <<  stat_op_sum << ", " <<
+                      stat_op_min << ", " << stat_op_max << ", " <<
+                      stat_op_variance << ");";
       res = rval;
     }
   else if (res.type() == pe_long)