This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH 1/2] benchtests: Memory walking benchmark for memcpy

From: Siddhesh Poyarekar <siddhesh at sourceware dot org>
To: libc-alpha at sourceware dot org
Date: Thu, 21 Sep 2017 11:59:28 +0530
Subject: Re: [PATCH 1/2] benchtests: Memory walking benchmark for memcpy
Authentication-results: sourceware.org; auth=none
References: <1505756414-12857-1-git-send-email-siddhesh@sourceware.org>
Reply-to: siddhesh at sourceware dot org

Ping, any comments on this new benchmark?

Siddhesh

On Monday 18 September 2017 11:10 PM, Siddhesh Poyarekar wrote:
> This benchmark is an attempt to eliminate cache effects from string
> benchmarks.  The benchmark walks both ways through a large memory area
> and copies different sizes of memory and alignments one at a time
> instead of looping around in the same memory area.  This is a good
> metric to have alongside the other memcpy benchmarks, especially for
> larger sizes where the likelihood of the call being done only once is
> pretty high.
> 
> The benchmark is unlike other string benchmarks in that it prints the
> total data rate achieved during a walk across the memory and not the
> time taken per execution.
> 
> 	* benchtests/bench-memcpy-walk.c: New file.
> 	* benchtests/Makefile (string-benchset): Add it.
> 
> ---
>  benchtests/Makefile            |   3 +-
>  benchtests/bench-memcpy-walk.c | 126 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 128 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-memcpy-walk.c
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index a0c3470..fbdeadf 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -37,7 +37,8 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
>  		   strcat strchr strchrnul strcmp strcpy strcspn strlen \
>  		   strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
>  		   strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> -		   strcoll memcpy-large memcpy-random memmove-large memset-large
> +		   strcoll memcpy-large memcpy-random memmove-large memset-large \
> +		   memcpy-walk
>  
>  # Build and run locale-dependent benchmarks only if we're building natively.
>  ifeq (no,$(cross-compiling))
> diff --git a/benchtests/bench-memcpy-walk.c b/benchtests/bench-memcpy-walk.c
> new file mode 100644
> index 0000000..df6aa33
> --- /dev/null
> +++ b/benchtests/bench-memcpy-walk.c
> @@ -0,0 +1,126 @@
> +/* Measure memcpy function combined throughput for different alignments.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* This microbenchmark measures the throughput of memcpy for various sizes from
> +   1 byte to 32MiB, doubling every iteration and then adding 0-7 and 9-16 bytes
> +   to the length.  The copies are done from source to destination and then
> +   back; the source walks forward across the array and the destination walks
> +   backward by one byte each, thus measuring misaligned accesses as well.  The
> +   idea is to avoid caching effects by copying different strings that are far
> +   enough from each other, walking in different directions so that we can
> +   measure prefetcher efficiency (software or hardware) more closely than with
> +   a loop copying the same data over and over, which eventually only gives us
> +   L1 cache performance.  */
> +
> +#ifndef MEMCPY_RESULT
> +# define MEMCPY_RESULT(dst, len) dst
> +# define START_SIZE 1
> +# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024)
> +# define TEST_MAIN
> +# define TEST_NAME "memcpy"
> +# define TIMEOUT (20 * 60)
> +# include "bench-string.h"
> +
> +IMPL (memcpy, 1)
> +#endif
> +
> +#include "json-lib.h"
> +
> +typedef char *(*proto_t) (char *, const char *, size_t);
> +
> +static void
> +do_one_test (json_ctx_t *json_ctx, impl_t *impl, char *dst, char *src,
> +	     size_t len)
> +{
> +  size_t i, iters = MIN_PAGE_SIZE;
> +  timing_t start, stop, cur;
> +
> +  char *dst_end = dst + MIN_PAGE_SIZE - len;
> +  char *src_end = src + MIN_PAGE_SIZE - len;
> +
> +  TIMING_NOW (start);
> +  /* Copy the entire buffer back and forth, LEN at a time.  */
> +  for (i = 0; i < iters && dst_end >= dst && src <= src_end; src++, dst_end--)
> +    {
> +      CALL (impl, dst_end, src, len);
> +      CALL (impl, src, dst_end, len);
> +      i += (len << 1);
> +    }
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  json_element_double (json_ctx, (double) iters / (double) cur);
> +}
> +
> +static void
> +do_test (json_ctx_t *json_ctx, size_t len)
> +{
> +  json_element_object_begin (json_ctx);
> +  json_attr_uint (json_ctx, "length", (double) len);
> +  json_array_begin (json_ctx, "timings");
> +
> +  FOR_EACH_IMPL (impl, 0)
> +    do_one_test (json_ctx, impl, (char *) buf2, (char *) buf1, len);
> +
> +  json_array_end (json_ctx);
> +  json_element_object_end (json_ctx);
> +}
> +
> +int
> +test_main (void)
> +{
> +  json_ctx_t json_ctx;
> +  size_t i;
> +
> +  test_init ();
> +
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  json_attr_object_begin (&json_ctx, "functions");
> +  json_attr_object_begin (&json_ctx, "memcpy");
> +  json_attr_string (&json_ctx, "bench-variant", "throughput");

I've changed this to "walk" since this may not be the only throughput
benchmark.

> +
> +  json_array_begin (&json_ctx, "ifuncs");
> +  FOR_EACH_IMPL (impl, 0)
> +    json_element_string (&json_ctx, impl->name);
> +  json_array_end (&json_ctx);
> +
> +  json_array_begin (&json_ctx, "results");
> +  for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
> +    {
> +      /* Test lengths offset from the base size by 0-7 and 9-16 bytes.  */
> +      for (int j = 0; j < 8; j++)
> +	{
> +	  do_test (&json_ctx, i + j);
> +	  do_test (&json_ctx, i + 16 - j);
> +	}
> +    }
> +
> +  json_array_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_attr_object_end (&json_ctx);
> +  json_document_end (&json_ctx);
> +
> +  return ret;
> +}
> +
> +#include <support/test-driver.c>
>

Follow-Ups:
- Re: [PATCH 1/2] benchtests: Memory walking benchmark for memcpy
  - From: Rajalakshmi Srinivasaraghavan

References:
- [PATCH 1/2] benchtests: Memory walking benchmark for memcpy
  - From: Siddhesh Poyarekar

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]