This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

Re: [PATCH] Add malloc micro benchmark

From: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>
To: DJ Delorie <dj at redhat dot com>
Cc: "libc-alpha at sourceware dot org" <libc-alpha at sourceware dot org>, nd <nd at arm dot com>
Date: Thu, 14 Feb 2019 16:38:54 +0000
Subject: Re: [PATCH] Add malloc micro benchmark
References: <DB5PR08MB1030A3EB64DA81DB604E140183920@DB5PR08MB1030.eurprd08.prod.outlook.com> (message from Wilco Dijkstra on Fri, 1 Feb 2019 16:27:34 +0000),<xnd0o2cbrl.fsf@greed.delorie.com>

Hi DJ,

> Looks good to me, although I'd like some additional comments in the test
> code.

Thanks for the review - I've added some extra comments:

+/* Benchmark the malloc/free performance of a varying number of blocks of a
+   given size.  This enables performance tracking of the t-cache and fastbins.
+   It tests 3 different scenarios: single-threaded using main arena,
+   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
+   false.  */

> +       else \
> +             for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
> +               echo "Running $${run} $${thr}"; \
> +               $(run-bench) $${thr} > $${run}-$${thr}.out; \
> +             done;\
> +       fi;\
>        done

> I wonder if this could be done more elegantly, but I'm OK with a simple
> approach for now.  If we end up adding many more such tests we might
> need to revisit this part.

The main concern was to get a clean state so that the test of a previous block
size doesn't affect subsequent results.

> +#define NUM_ITERS 1000000
> +#define NUM_ALLOCS 4
> +#define MAX_ALLOCS 1600

> How long does this test take to run, on average, compared to other
> tests?  Do we have to worry about increasing timeouts for slow hosts?

All the tests together finish in a fraction of the time taken by a single
test of bench-malloc-thread, so if anything we need to reduce the time of
that one by an order of magnitude (it takes ~5 minutes!).

> +static void
> +do_benchmark (malloc_args *args, int **arr)
> +{
> +  timing_t start, stop;
> +  size_t iters = args->iters;
> +  size_t size = args->size;
> +  int n = args->n;
> +
> +  TIMING_NOW (start);
> +
> +  for (int j = 0; j < iters; j++)
> +    {
> +      for (int i = 0; i < n; i++)
> +     arr[i] = malloc (size);
> +
> +      for (int i = 0; i < n; i++)
> +     free (arr[i]);
> +    }
> +
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (args->elapsed, start, stop);
> +}

> Simple loop, but doesn't test for malloc returning NULL.

Yeah, the benchmark doesn't need to care since the amount we allocate
is tiny (6.4MBytes).

Cheers,
Wilco

I've committed this:

Add a malloc micro benchmark to enable accurate testing of the
various paths in malloc and free.  The benchmark does a varying
number of allocations of a given block size, then frees them again.

It tests 3 different scenarios: single-threaded using main arena,
multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
false.

OK for commit?

ChangeLog:
2019-02-14  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add malloc-simple benchmark.
	* benchtests/bench-malloc-simple.c: New benchmark.

--
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -90,7 +90,7 @@ CFLAGS-bench-trunc.c += -fno-builtin
 CFLAGS-bench-truncf.c += -fno-builtin
 
 ifeq (${BENCHSET},)
-bench-malloc := malloc-thread
+bench-malloc := malloc-thread malloc-simple
 else
 bench-malloc := $(filter malloc-%,${BENCHSET})
 endif
@@ -98,7 +98,7 @@ endif
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
-$(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)
 
 
 
@@ -165,7 +165,7 @@ bench-clean:
 ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
    wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread
+   malloc-thread malloc-simple
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
@@ -194,10 +194,18 @@ bench-set: $(binaries-benchset)
 
 bench-malloc: $(binaries-bench-malloc)
 	for run in $^; do \
+	  echo "$${run}"; \
+	  if [ `basename $${run}` = "bench-malloc-thread" ]; then \
 		for thr in 1 8 16 32; do \
 			echo "Running $${run} $${thr}"; \
-	  $(run-bench) $${thr} > $${run}-$${thr}.out; \
-	  done;\
+			$(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  else \
+		for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
+		  echo "Running $${run} $${thr}"; \
+		  $(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  fi;\
 	done
 
 # Build and execute the benchmark functions.  This target generates JSON
diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..83203ff3187654a1710c9ef81016f854957b9d64
--- /dev/null
+++ b/benchtests/bench-malloc-simple.c
@@ -0,0 +1,188 @@
+/* Benchmark malloc and free functions.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/resource.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+/* Benchmark the malloc/free performance of a varying number of blocks of a
+   given size.  This enables performance tracking of the t-cache and fastbins.
+   It tests 3 different scenarios: single-threaded using main arena,
+   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
+   false.  */
+
+#define NUM_ITERS 200000
+#define NUM_ALLOCS 4
+#define MAX_ALLOCS 1600
+
+typedef struct		/* Inputs and timing result of one do_benchmark call.  */
+{
+  size_t iters;		/* Rounds of the outer loop in do_benchmark.  */
+  size_t size;		/* Size passed to each malloc call.  */
+  int n;		/* Blocks allocated and then freed per round.  */
+  timing_t elapsed;	/* Set by do_benchmark with TIMING_DIFF.  */
+} malloc_args;
+
+/* Time ARGS->iters rounds of ARGS->n mallocs of ARGS->size bytes followed by
+   ARGS->n frees, keeping the pointers in ARR; store it in ARGS->elapsed.  */
+static void
+do_benchmark (malloc_args *args, int **arr)
+{
+  timing_t start, stop;
+  size_t iters = args->iters;
+  size_t size = args->size;
+  int n = args->n;
+
+  TIMING_NOW (start);
+
+  /* J has the type of ITERS to avoid a signed/unsigned comparison.  */
+  for (size_t j = 0; j < iters; j++)
+    {
+      for (int i = 0; i < n; i++)
+	arr[i] = malloc (size);
+      for (int i = 0; i < n; i++)
+	free (arr[i]);
+    }
+  TIMING_NOW (stop);
+  TIMING_DIFF (args->elapsed, start, stop);
+}
+
+static malloc_args tests[3][NUM_ALLOCS];  /* One row per scenario.  */
+static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };  /* Block counts (n).  */
+
+/* Thread start routine: run every tests[2] entry with P as the pointer
+   array, then hand P back as the thread's result.  */
+static void *
+thread_test (void *p)
+{
+  malloc_args *cur = tests[2];
+  malloc_args *const end = cur + NUM_ALLOCS;
+  while (cur != end)
+    do_benchmark (cur++, (int **) p);
+  return p;
+}
+
+/* Run the three benchmark scenarios for blocks of SIZE bytes and print the
+   timings, SIZE and the maximum RSS as a JSON document on stdout.  Each
+   timing is the elapsed time of one run divided by NUM_ITERS.  */
+void
+bench (unsigned long size)
+{
+  size_t iters = NUM_ITERS;
+  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
+  unsigned long res;
+
+  TIMING_INIT (res);
+
+  /* Set up every scenario.  The bound is the number of rows in TESTS, so
+     the loop cannot write past the end of the array.  */
+  for (size_t t = 0; t < sizeof tests / sizeof tests[0]; t++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      {
+	tests[t][i].n = allocs[i];
+	tests[t][i].size = size;
+	tests[t][i].iters = iters / allocs[i];
+
+	/* Do a quick warmup run.  */
+	if (t == 0)
+	  do_benchmark (&tests[0][i], arr);
+      }
+
+  /* Run benchmark single threaded in main_arena.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[0][i], arr);
+
+  /* Run benchmark in a thread_arena.  */
+  pthread_t t;
+  pthread_create (&t, NULL, thread_test, (void*)arr);
+  pthread_join (t, NULL);
+
+  /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[1][i], arr);
+
+  free (arr);
+
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+  json_attr_object_begin (&json_ctx, "functions");
+  json_attr_object_begin (&json_ctx, "malloc");
+
+  char s[100];
+  double iters2 = iters;
+
+  json_attr_object_begin (&json_ctx, "");
+  json_attr_double (&json_ctx, "malloc_block_size", size);
+
+  struct rusage usage;
+  getrusage (RUSAGE_SELF, &usage);
+  json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss);
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2);
+    }
+
+  /* Close the "", "malloc" and "functions" objects, then the document.  */
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_attr_object_end (&json_ctx);
+  json_document_end (&json_ctx);
+}
+
+static void usage (const char *name)	/* Report the expected argument.  */
+{
+  fprintf (stderr, "%s: <alloc_size>\n", name);	/* NAME, then synopsis.  */
+  exit (1);					/* Does not return.  */
+}
+
+/* Benchmark the block size given in ARGV[1] (default 16).  The argument must
+   be a complete, positive number (no trailing characters), else call usage.  */
+int
+main (int argc, char **argv)
+{
+  long val = 16;
+  char *end = NULL;
+  if (argc == 2)
+    val = strtol (argv[1], &end, 0);
+  if (argc > 2 || val <= 0 || (end != NULL && *end != '\0'))
+    usage (argv[0]);
+  bench (val);
+  return 0;
+}

Follow-Ups:
- Re: [PATCH] Add malloc micro benchmark
  - From: DJ Delorie

References:
- [PATCH] Add malloc micro benchmark
  - From: Wilco Dijkstra
- Re: [PATCH] Add malloc micro benchmark
  - From: DJ Delorie

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]