This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/2] benchtests: Memory walking benchmark for memcpy
- From: Siddhesh Poyarekar <siddhesh at sourceware dot org>
- To: libc-alpha at sourceware dot org
- Date: Thu, 21 Sep 2017 11:59:28 +0530
- Subject: Re: [PATCH 1/2] benchtests: Memory walking benchmark for memcpy
- Authentication-results: sourceware.org; auth=none
- References: <1505756414-12857-1-git-send-email-siddhesh@sourceware.org>
- Reply-to: siddhesh at sourceware dot org
Ping, any comments on this new benchmark?
Siddhesh
On Monday 18 September 2017 11:10 PM, Siddhesh Poyarekar wrote:
> This benchmark is an attempt to eliminate cache effects from string
> benchmarks. The benchmark walks both ways through a large memory area,
> copying blocks of different sizes and alignments one at a time
> instead of looping over the same memory area. This is a good
> metric to have alongside the other memcpy benchmarks, especially for
> larger sizes, where the call is quite likely to be made only once.
>
> The benchmark is unlike other string benchmarks in that it prints the
> total data rate achieved during a walk across the memory and not the
> time taken per execution.
>
> * benchtests/bench-memcpy-walk.c: New file.
> * benchtests/Makefile (string-benchset): Add it.
>
> ---
> benchtests/Makefile | 3 +-
> benchtests/bench-memcpy-walk.c | 126 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 128 insertions(+), 1 deletion(-)
> create mode 100644 benchtests/bench-memcpy-walk.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index a0c3470..fbdeadf 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -37,7 +37,8 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
> strcat strchr strchrnul strcmp strcpy strcspn strlen \
> strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> - strcoll memcpy-large memcpy-random memmove-large memset-large
> + strcoll memcpy-large memcpy-random memmove-large memset-large \
> + memcpy-walk
>
> # Build and run locale-dependent benchmarks only if we're building natively.
> ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memcpy-walk.c b/benchtests/bench-memcpy-walk.c
> new file mode 100644
> index 0000000..df6aa33
> --- /dev/null
> +++ b/benchtests/bench-memcpy-walk.c
> @@ -0,0 +1,126 @@
> +/* Measure memcpy function combined throughput for different alignments.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* This microbenchmark measures the throughput of memcpy for various sizes from
> + 1 byte to 32MiB, doubling every iteration and then misaligning by 0-15
> + bytes. Each copy is done from source to destination and then back. After
> + every pair of copies the source walks forward across the array and the
> + destination walks backward, by one byte each, so misaligned accesses are
> + measured as well. The idea is to avoid caching effects by copying different
> + data each time, with source and destination far apart and walking in opposite
> + directions, so that we can measure prefetcher efficiency (software or
> + hardware) more closely than with a loop copying the same data over and over,
> + which eventually only gives us L1 cache performance. */
> +
> +#ifndef MEMCPY_RESULT
> +# define MEMCPY_RESULT(dst, len) dst
> +# define START_SIZE 1
> +# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024)
> +# define TEST_MAIN
> +# define TEST_NAME "memcpy"
> +# define TIMEOUT (20 * 60)
> +# include "bench-string.h"
> +
> +IMPL (memcpy, 1)
> +#endif
> +
> +#include "json-lib.h"
> +
> +typedef char *(*proto_t) (char *, const char *, size_t);
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
> + size_t len)
> +{
> + size_t i, iters = MIN_PAGE_SIZE;
> + timing_t start, stop, cur;
> +
> + char *dst_end = dst + MIN_PAGE_SIZE - len;
> + char *src_end = src + MIN_PAGE_SIZE - len;
> +
> + TIMING_NOW (start);
> + /* Copy the entire buffer back and forth, LEN at a time. */
> + for (i = 0; i < iters && dst_end >= dst && src <= src_end; src++, dst_end--)
> + {
> + CALL (impl, dst_end, src, len);
> + CALL (impl, src, dst_end, len);
> + i += (len << 1);
> + }
> + TIMING_NOW (stop);
> +
> + TIMING_DIFF (cur, start, stop);
> +
> + json_element_double (json_ctx, (double) iters / (double) cur);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t len)
> +{
> + json_element_object_begin (json_ctx);
> + json_attr_uint (json_ctx, "length", (double) len);
> + json_array_begin (json_ctx, "timings");
> +
> + FOR_EACH_IMPL (impl, 0)
> + do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
> +
> + json_array_end (json_ctx);
> + json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> + json_ctx_t json_ctx;
> + size_t i;
> +
> + test_init ();
> +
> + json_init (&json_ctx, 0, stdout);
> +
> + json_document_begin (&json_ctx);
> + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> + json_attr_object_begin (&json_ctx, "functions");
> + json_attr_object_begin (&json_ctx, "memcpy");
> + json_attr_string (&json_ctx, "bench-variant", "throughput");
I've changed this to "walk" since this may not be the only throughput
benchmark.
> +
> + json_array_begin (&json_ctx, "ifuncs");
> + FOR_EACH_IMPL (impl, 0)
> + json_element_string (&json_ctx, impl->name);
> + json_array_end (&json_ctx);
> +
> + json_array_begin (&json_ctx, "results");
> + for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> + {
> + /* Test length alignments from 0-16 bytes. */
> + for (int j = 0; j < 8; j++)
> + {
> + do_test (&json_ctx, i + j);
> + do_test (&json_ctx, i + 16 - j);
> + }
> + }
> +
> + json_array_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_attr_object_end (&json_ctx);
> + json_document_end (&json_ctx);
> +
> + return ret;
> +}
> +
> +#include <support/test-driver.c>
>