This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] Use HP_TIMING for benchmarks if available
- From: Siddhesh Poyarekar <siddhesh at redhat dot com>
- To: libc-alpha at sourceware dot org
- Date: Mon, 15 Apr 2013 13:49:36 +0530
- Subject: [PATCH] Use HP_TIMING for benchmarks if available
Hi,
Here's a patch that adds support for using HP_TIMING in benchmark
measurements when it is available. clock_gettime is still kept as a
fallback when !HP_TIMING_AVAIL. Additionally, I've also added support
to override HP_TIMING to use clock_gettime by executing:
make USE_CLOCK_GETTIME=1 bench
One would need a 'make bench-clean' to ensure that the sources are
rebuilt whenever one needs to switch between clock_gettime and
HP_TIMING. Another easy way is to just touch benchtests/Makefile.
I have verified that the measurements on x86_64 are consistent over
multiple runs, so my concern about scheduler overhead causing jitter was
unfounded. Jitter due to high system load is unavoidable even with
clock_gettime since (as Rich Felker pointed out in an earlier
discussion) competition for cache will still affect the performance
numbers.
Siddhesh
* Makeconfig (cflags): Add bench-cflags.
* benchtests/Makefile: Define bench-cflags if
USE_CLOCK_GETTIME is defined.
* benchtests/bench-skeleton.c: Include bench-timing.h.
(main): Use TIMING_* macros instead of clock_gettime.
* benchtests/bench-timing.h: New file.
diff --git a/Makeconfig b/Makeconfig
index a3d3e70..a83485f 100644
--- a/Makeconfig
+++ b/Makeconfig
@@ -755,6 +755,9 @@ ifeq "$(strip $(+cflags))" ""
+cflags := $(default_cflags)
endif # $(+cflags) == ""
+# Add common benchmark CFLAGS
++cflags += $(bench-cflags)
+
+cflags += $(cflags-cpu) $(+gccwarn) $(+merge-constants) $(+math-flags)
+gcc-nowarn := -w
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 3e794d7..d330abb 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -109,12 +109,16 @@ LDFLAGS-bench-slowatan = -lm
# Rules to build and execute the benchmarks. Do not put any benchmark
# parameters beyond this point.
+ifdef USE_CLOCK_GETTIME
+bench-cflags := -DUSE_CLOCK_GETTIME
+endif
+
include ../Makeconfig
include ../Rules
binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
-bench-deps := bench-skeleton.c Makefile
+bench-deps := bench-skeleton.c bench-timing.h Makefile
run-bench = $(test-wrapper-env) \
GCONV_PATH=$(common-objpfx)iconvdata LC_ALL=C \
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 13f986d..99a316e 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -17,59 +17,53 @@
<http://www.gnu.org/licenses/>. */
#include <string.h>
-#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <inttypes.h>
+#include "bench-timing.h"
int
main (int argc, char **argv)
{
unsigned long i, j, k;
- uint64_t total = 0, max = 0, min = 0x7fffffffffffffff;
- struct timespec start, end;
+ timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
+ timing_t start, end;
memset (&start, 0, sizeof (start));
memset (&end, 0, sizeof (end));
- clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);
+ unsigned long iters;
+
+ TIMING_INIT (iters);
- /* Measure 1000 times the resolution of the clock. So for a 1ns resolution
- clock, we measure 1000 iterations of the function call at a time.
- Measurements close to the minimum clock resolution won't make much sense,
- but it's better than having nothing at all. */
- unsigned long iters = 1000 * start.tv_nsec;
unsigned long total_iters = ITER / iters;
for (i = 0; i < NUM_SAMPLES; i++)
{
for (j = 0; j < total_iters; j ++)
{
- clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &start);
+ int64_t cur;
+
+ TIMING_NOW (start);
for (k = 0; k < iters; k++)
BENCH_FUNC(i);
- clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &end);
-
- uint64_t cur = (end.tv_nsec - start.tv_nsec
- + ((end.tv_sec - start.tv_sec)
- * (uint64_t) 1000000000));
+ TIMING_NOW (end);
+ TIMING_DIFF (cur, start, end);
if (cur > max)
max = cur;
if (cur < min)
min = cur;
- total += cur;
+ TIMING_ACCUM (total, cur);
}
}
- double d_total_s = total * 1e-9;
+ double d_total_s = total;
double d_iters = iters;
double d_total_i = (double)ITER * NUM_SAMPLES;
- printf (FUNCNAME ": ITERS:%g: TOTAL:%gs, MAX:%gns, MIN:%gns, %g iter/s\n",
- d_total_i, d_total_s, max / d_iters, min / d_iters,
- d_total_i / d_total_s);
+ TIMING_PRINT_STATS (d_total_s, d_iters, d_total_i, max, min);
return 0;
}
diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
new file mode 100644
index 0000000..e67a88d
--- /dev/null
+++ b/benchtests/bench-timing.h
@@ -0,0 +1,72 @@
+/* Define timing macros.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <hp-timing.h>
+#include <stdint.h>
+
+#if HP_TIMING_AVAIL && !defined USE_CLOCK_GETTIME
+# define GL(x) _##x
+# define GLRO(x) _##x
+hp_timing_t _dl_hp_timing_overhead;
+typedef hp_timing_t timing_t;
+
+# define TIMING_INIT(iters) \
+({ \
+ HP_TIMING_DIFF_INIT(); \
+ (iters) = 1000; \
+})
+
+# define TIMING_NOW(var) HP_TIMING_NOW (var)
+# define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
+# define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
+/* Values are HP_TIMING ticks, printed with a 'C' suffix; rate is calls per 1e6 ticks. */
+# define TIMING_PRINT_STATS(d_total_s, d_iters, d_total_i, max, min) \
+ printf (FUNCNAME ": ITERS:%g: TOTAL:%gC, MAX:%gC, MIN:%gC, %g calls/MC\n", \
+ (d_total_i), (d_total_s), (max) / (d_iters), (min) / (d_iters), \
+ 1e6 * (d_total_i) / (d_total_s))
+
+#else
+typedef uint64_t timing_t;
+
+/* Measure 1000 times the resolution of the clock. So for a 1ns
+ resolution clock, we measure 1000 iterations of the function call at a
+ time. Measurements close to the minimum clock resolution won't make
+ much sense, but it's better than having nothing at all. */
+# define TIMING_INIT(iters) \
+({ \
+ struct timespec start; \
+ clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); \
+ (iters) = 1000 * start.tv_nsec; \
+})
+
+# define TIMING_NOW(var) \
+({ \
+ struct timespec tv; \
+ clock_gettime (CLOCK_PROCESS_CPUTIME_ID, &tv); \
+ (var) = (uint64_t) (tv.tv_nsec + (uint64_t) 1000000000 * tv.tv_sec); \
+})
+
+# define TIMING_DIFF(diff, start, end) (diff) = (end) - (start)
+# define TIMING_ACCUM(sum, diff) (sum) += (diff)
+/* Timings are in ns: TOTAL is converted to seconds, MAX/MIN are ns per call. */
+# define TIMING_PRINT_STATS(d_total_s, d_iters, d_total_i, max, min) \
+ printf (FUNCNAME ": ITERS:%g: TOTAL:%gs, MAX:%gns, MIN:%gns, %g iter/s\n", \
+ (d_total_i), (d_total_s) * 1e-9, (max) / (d_iters), \
+ (min) / (d_iters), (d_total_i) / ((d_total_s) * 1e-9))
+
+#endif