[PATCH v2 1/6] x86-64: Create microbenchmark infrastructure for libmvec
Sunil Pandey
skpgkp2@gmail.com
Sun Nov 14 02:59:36 GMT 2021
On Sat, Nov 13, 2021 at 11:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Nov 12, 2021 at 2:51 PM Sunil Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com>
> > wrote:
> >
> > > On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > Add python script to generate libmvec microbenchmark from the input
> > > > values for each libmvec function using skeleton benchmark template.
> > > >
> > > > Creates double and float benchmarks with vector length 1, 2, 4, 8,
> > > > and 16 for each libmvec function. Vector length 1 corresponds to
> > > > scalar version of function and is included for vector function perf
> > > > comparison.
> > > > ---
> > > > sysdeps/x86_64/fpu/Makeconfig | 35 ++
> > > > sysdeps/x86_64/fpu/Makefile | 40 ++
> > > > sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++
> > > > sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464
> ++++++++++++++++++++
> > > > 4 files changed, 643 insertions(+)
> > > > create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > > create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/Makeconfig
> > > b/sysdeps/x86_64/fpu/Makeconfig
> > > > index 24aaee1a43..503e9b5ffa 100644
> > > > --- a/sysdeps/x86_64/fpu/Makeconfig
> > > > +++ b/sysdeps/x86_64/fpu/Makeconfig
> > > > @@ -29,6 +29,23 @@ libmvec-funcs = \
> > > > sin \
> > > > sincos \
> > > >
> > > > +# Define libmvec function for benchtests directory.
> > > > +libmvec-bench-funcs = \
> > > > +
> > > > +bench-libmvec-double = \
> > > > + $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
> > > > + $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
> > > > + $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
> > > > + $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
> > > > + $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
> > > > +
> > > > +bench-libmvec-float = \
> > > > + $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs)))
> \
> > > > + $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs)))
> \
> > > > + $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs)))
> \
> > > > + $(addsuffix f, $(addprefix float-vlen8-avx2-,
> > > $(libmvec-bench-funcs))) \
> > > > + $(addsuffix f, $(addprefix float-vlen16-,
> $(libmvec-bench-funcs))) \
> > > > +
> > > > # The base libmvec ABI tests.
> > > > libmvec-abi-func-tests = \
> > > > $(addprefix test-double-libmvec-,$(libmvec-funcs)) \
> > > > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk:
> > > $(common-objpfx)config.make
> > > > echo " \$$(float-vlen16-arch-ext-cflags)"; \
> > > > echo; \
> > > > done; \
> > > > + echo "endif"; \
> > > > + echo "ifeq (\$$(subdir),benchtests)"; \
> > > > + for t in $(libmvec-bench-funcs); do \
> > > > + echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
> > > > + echo " \$$(double-vlen4-arch-ext-cflags)"; \
> > > > + echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
> > > > + echo " \$$(double-vlen4-arch-ext2-cflags)"; \
> > > > + echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
> > > > + echo " \$$(double-vlen8-arch-ext-cflags)"; \
> > > > + echo; \
> > > > + echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
> > > > + echo " \$$(float-vlen8-arch-ext-cflags)"; \
> > > > + echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
> > > > + echo " \$$(float-vlen8-arch-ext2-cflags)"; \
> > > > + echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
> > > > + echo " \$$(float-vlen16-arch-ext-cflags)"; \
> > > > + echo; \
> > > > + done; \
> > > > echo "endif") > $@T
> > > > mv -f $@T $@
> > > > diff --git a/sysdeps/x86_64/fpu/Makefile
> b/sysdeps/x86_64/fpu/Makefile
> > > > index d172ae815d..9fb587cf8f 100644
> > > > --- a/sysdeps/x86_64/fpu/Makefile
> > > > +++ b/sysdeps/x86_64/fpu/Makefile
> > > > @@ -72,3 +72,43 @@ ifeq
> > > ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
> > > > # performance of sin and cos by more than 40% on Skylake.
> > > > CFLAGS-branred.c = -mprefer-vector-width=128
> > > > endif
> > > > +
> > > > +ifeq ($(subdir),benchtests)
> > > > +double-vlen4-arch-ext-cflags = -mavx
> > > > +double-vlen4-arch-ext2-cflags = -mavx2
> > > > +double-vlen8-arch-ext-cflags = -mavx512f
> > > > +
> > > > +float-vlen8-arch-ext-cflags = -mavx
> > > > +float-vlen8-arch-ext2-cflags = -mavx2
> > > > +float-vlen16-arch-ext-cflags = -mavx512f
> > > > +
> > > > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
> > > > +
> > > > +ifeq (${BENCHSET},)
> > > > +bench += $(bench-libmvec)
> > > > +endif
> > > > +
> > > > +ifeq (${STATIC-BENCHTESTS},yes)
> > > > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a
> > > $(common-objpfx)math/libm.a
> > > > +else
> > > > +libmvec-benchtests = $(libmvec) $(libm)
> > > > +endif
> > > > +
> > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)):
> > > $(libmvec-benchtests)
> > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)):
> > > $(libmvec-benchtests)
> > > > +bench-libmvec-deps =
> $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > bench-timing.h Makefile
> > > > +
> > > > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
> > > > + { if [ -n "$($*-INCLUDE)" ]; then \
> > > > + cat $($*-INCLUDE); \
> > > > + fi; \
> > > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > > $(basename $(@F)); } > $@-tmp
> > > > + mv -f $@-tmp $@
> > > > +
> > > > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
> > > > + { if [ -n "$($*-INCLUDE)" ]; then \
> > > > + cat $($*-INCLUDE); \
> > > > + fi; \
> > > > + $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > > $(basename $(@F)); } > $@-tmp
> > > > + mv -f $@-tmp $@
> > > > +endif
> > > > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > > new file mode 100644
> > > > index 0000000000..d56a0c4462
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > > @@ -0,0 +1,104 @@
> > > > +/* Skeleton for libmvec benchmark programs.
> > > > + Copyright (C) 2021 Free Software Foundation, Inc.
> > > > + This file is part of the GNU C Library.
> > > > +
> > > > + The GNU C Library is free software; you can redistribute it
> and/or
> > > > + modify it under the terms of the GNU Lesser General Public
> > > > + License as published by the Free Software Foundation; either
> > > > + version 2.1 of the License, or (at your option) any later
> version.
> > > > +
> > > > + The GNU C Library is distributed in the hope that it will be
> useful,
> > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > > > + Lesser General Public License for more details.
> > > > +
> > > > + You should have received a copy of the GNU Lesser General Public
> > > > + License along with the GNU C Library; if not, see
> > > > + <https://www.gnu.org/licenses/>. */
> > > > +
> > > > +#include <string.h>
> > > > +#include <stdint.h>
> > > > +#include <stdbool.h>
> > > > +#include <stdio.h>
> > > > +#include <time.h>
> > > > +#include <inttypes.h>
> > > > +#include <bench-timing.h>
> > > > +#include <json-lib.h>
> > > > +#include <bench-util.h>
> > > > +
> > > > +#include <bench-util.c>
> > > > +#include <math-tests-arch.h>
> > > > +#define D_ITERS 10000
> > > > +
> > > > +int
> > > > +main (int argc, char **argv)
> > > > +{
> > > > + unsigned long i, k;
> > > > + timing_t start, end;
> > > > + json_ctx_t json_ctx;
> > > > +
> > > > +#if defined REQUIRE_AVX
> > > > + if (!CPU_FEATURE_ACTIVE (AVX))
> > > > + {
> > > > + printf ("AVX not supported.\n");
> > > > + return 0;
> > > > + }
> > > > +#elif defined REQUIRE_AVX2
> > > > + if (!CPU_FEATURE_ACTIVE (AVX2))
> > > > + {
> > > > + printf ("AVX2 not supported.\n");
> > > > + return 0;
> > > > + }
> > > > +#elif defined REQUIRE_AVX512F
> > > > + if (!CPU_FEATURE_ACTIVE (AVX512F))
> > > > + {
> > > > + printf ("AVX512F not supported.\n");
> > > > + return 0;
> > > > + }
> > > > +#endif
> > > > +
> > > > + bench_start ();
> > > > +
> > > > +#ifdef BENCH_INIT
> > > > + BENCH_INIT ();
> > > > +#endif
> > > > +
> > > > + json_init (&json_ctx, 2, stdout);
> > > > +
> > > > + /* Begin function. */
> > > > + json_attr_object_begin (&json_ctx, FUNCNAME);
> > > > +
> > > > + for (int v = 0; v < NUM_VARIANTS; v++)
> > > > + {
> > > > + double d_total_time = 0;
> > > > + uint64_t cur;
> > >
> > > Think these should also be type `timing_t`
> > >
> >
> > I do not see a difference whether I use timing_t or uint64_t. In any case,
> > the variable cur stores the
> > difference between the start and end times, not a time.
> >
> >
> > >
> > > > + for (k = 0; k < D_ITERS; k++)
> > > > + {
> > > > + TIMING_NOW (start);
> > > > + for (i = 0; i < NUM_SAMPLES (v); i++)
> > >
> > > What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not
> > > one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`?
> > >
> >
> > D_ITERS defines how many times each variant's full data set will run.
> > NUM_SAMPLES(v)
> > represents the number of data sets in variant v. Indices v and i select
> > the i'th
> > data set from
> > variant v and call the vector function. Having two loops simplifies the logic.
> >
> >
> > > > + BENCH_FUNC (v, i);
> > > > + TIMING_NOW (end);
> > > > +
> > > > + TIMING_DIFF (cur, start, end);
> > > > +
> > > > + d_total_time += cur;
> > >.> > Think this should be `TIMING_ACCUM(d_total_time, cur)`.
> > >
> >
> > There is not much difference whether I use TIMING_ACCUM or simply add cur to
> > d_total_time.
> >
>
> Please use TIMING_ACCUM (d_total_time, cur) to be consistent with
> TIMING_DIFF (cur, start, end).
>
Sure, I will fix it in the next version.
>
> Thanks.
>
>
> --
> H.J.
>
More information about the Libc-alpha
mailing list