diff mbox series

Add malloc micro benchmark

Message ID DB5PR08MB1030A3EB64DA81DB604E140183920@DB5PR08MB1030.eurprd08.prod.outlook.com
State New
Headers show
Series Add malloc micro benchmark | expand

Commit Message

Wilco Dijkstra Feb. 1, 2019, 4:27 p.m. UTC
Add a malloc micro benchmark to enable accurate testing of the
various paths in malloc and free.  The benchmark does a varying
number of allocations of a given block size, then frees them again.

It tests 3 different scenarios: single-threaded using main arena,
multi-threaded using thread-arena, main arena with SINGLE_THREAD_P
false.

OK for commit?

ChangeLog:
2019-02-01  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add malloc-simple benchmark.
	* benchtests/bench-malloc-simple.c: New benchmark.

--

Comments

Wilco Dijkstra Feb. 14, 2019, 4:38 p.m. UTC | #1
Hi DJ,

> Looks good to me, although I'd like some additional comments in the test
> code.

Thanks for the review - I've added some extra comments:

+/* Benchmark the malloc/free performance of a varying number of blocks of a
+   given size.  This enables performance tracking of the t-cache and fastbins.
+   It tests 3 different scenarios: single-threaded using main arena,
+   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
+   false.  */

> +       else \
> +             for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
> +               echo "Running $${run} $${thr}"; \
> +               $(run-bench) $${thr} > $${run}-$${thr}.out; \
> +             done;\
> +       fi;\
>        done

> I wonder if this could be done more elegantly, but I'm OK with a simple
> approach for now.  If we end up adding many more such tests we might
> need to revisit this part.

The main concern was to get a clean state so that the test of a previous block
size doesn't affect subsequent results.

> +#define NUM_ITERS 1000000
> +#define NUM_ALLOCS 4
> +#define MAX_ALLOCS 1600

> How long does this test take to run, on average, compared to other
> tests?  Do we have to worry about increasing timeouts for slow hosts?

All the tests together finish in a fraction of the time taken by a single
test of bench-malloc-thread, so if anything we need to reduce the time of
that one by an order of magnitude (it takes ~5 minutes!).

> +static void
> +do_benchmark (malloc_args *args, int **arr)
> +{
> +  timing_t start, stop;
> +  size_t iters = args->iters;
> +  size_t size = args->size;
> +  int n = args->n;
> +
> +  TIMING_NOW (start);
> +
> +  for (int j = 0; j < iters; j++)
> +    {
> +      for (int i = 0; i < n; i++)
> +     arr[i] = malloc (size);
> +
> +      for (int i = 0; i < n; i++)
> +     free (arr[i]);
> +    }
> +
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (args->elapsed, start, stop);
> +}

> Simple loop, but doesn't test for malloc returning NULL.

Yeah, the benchmark doesn't need to care since the amount we allocate
is tiny (6.4MBytes).

Cheers,
Wilco

I've committed this:

Add a malloc micro benchmark to enable accurate testing of the
various paths in malloc and free.  The benchmark does a varying
number of allocations of a given block size, then frees them again.

It tests 3 different scenarios: single-threaded using main arena,
multi-threaded using thread-arena, main arena with SINGLE_THREAD_P
false.

OK for commit?

ChangeLog:
2019-02-14  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/Makefile: Add malloc-simple benchmark.
	* benchtests/bench-malloc-simple.c: New benchmark.

--
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -90,7 +90,7 @@ CFLAGS-bench-trunc.c += -fno-builtin
 CFLAGS-bench-truncf.c += -fno-builtin
 
 ifeq (${BENCHSET},)
-bench-malloc := malloc-thread
+bench-malloc := malloc-thread malloc-simple
 else
 bench-malloc := $(filter malloc-%,${BENCHSET})
 endif
@@ -98,7 +98,7 @@ endif
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
-$(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)
 
 
 
@@ -165,7 +165,7 @@ bench-clean:
 ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
    wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread
+   malloc-thread malloc-simple
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
@@ -194,10 +194,18 @@ bench-set: $(binaries-benchset)
 
 bench-malloc: $(binaries-bench-malloc)
 	for run in $^; do \
+	  echo "$${run}"; \
+	  if [ `basename $${run}` = "bench-malloc-thread" ]; then \
 		for thr in 1 8 16 32; do \
 			echo "Running $${run} $${thr}"; \
-	  $(run-bench) $${thr} > $${run}-$${thr}.out; \
-	  done;\
+			$(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  else \
+		for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
+		  echo "Running $${run} $${thr}"; \
+		  $(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  fi;\
 	done
 
 # Build and execute the benchmark functions.  This target generates JSON
diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..83203ff3187654a1710c9ef81016f854957b9d64
--- /dev/null
+++ b/benchtests/bench-malloc-simple.c
@@ -0,0 +1,188 @@
+/* Benchmark malloc and free functions.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/resource.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+/* Benchmark the malloc/free performance of a varying number of blocks of a
+   given size.  This enables performance tracking of the t-cache and fastbins.
+   It tests 3 different scenarios: single-threaded using main arena,
+   multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P
+   false.  */
+
+#define NUM_ITERS 200000
+#define NUM_ALLOCS 4
+#define MAX_ALLOCS 1600
+
+typedef struct
+{
+  size_t iters;
+  size_t size;
+  int n;
+  timing_t elapsed;
+} malloc_args;
+
+static void
+do_benchmark (malloc_args *args, int **arr)
+{
+  timing_t start, stop;
+  size_t iters = args->iters;
+  size_t size = args->size;
+  int n = args->n;
+
+  TIMING_NOW (start);
+
+  for (int j = 0; j < iters; j++)
+    {
+      for (int i = 0; i < n; i++)
+	arr[i] = malloc (size);
+
+      for (int i = 0; i < n; i++)
+	free (arr[i]);
+    }
+
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (args->elapsed, start, stop);
+}
+
+static malloc_args tests[3][NUM_ALLOCS];
+static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };
+
+static void *
+thread_test (void *p)
+{
+  int **arr = (int**)p;
+
+  /* Run benchmark multi-threaded.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[2][i], arr);
+
+  return p;
+}
+
+void
+bench (unsigned long size)
+{
+  size_t iters = NUM_ITERS;
+  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
+  unsigned long res;
+
+  TIMING_INIT (res);
+
+  for (int t = 0; t <= 3; t++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      {
+	tests[t][i].n = allocs[i];
+	tests[t][i].size = size;
+	tests[t][i].iters = iters / allocs[i];
+
+	/* Do a quick warmup run.  */
+	if (t == 0)
+	  do_benchmark (&tests[0][i], arr);
+      }
+
+  /* Run benchmark single threaded in main_arena.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[0][i], arr);
+
+  /* Run benchmark in a thread_arena.  */
+  pthread_t t;
+  pthread_create (&t, NULL, thread_test, (void*)arr);
+  pthread_join (t, NULL);
+
+  /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[1][i], arr);
+
+  free (arr);
+
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+
+  json_attr_object_begin (&json_ctx, "malloc");
+
+  char s[100];
+  double iters2 = iters;
+
+  json_attr_object_begin (&json_ctx, "");
+  json_attr_double (&json_ctx, "malloc_block_size", size);
+
+  struct rusage usage;
+  getrusage (RUSAGE_SELF, &usage);
+  json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss);
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2);
+    }
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_document_end (&json_ctx);
+}
+
+static void usage (const char *name)
+{
+  fprintf (stderr, "%s: <alloc_size>\n", name);
+  exit (1);
+}
+
+int
+main (int argc, char **argv)
+{
+  long val = 16;
+  if (argc == 2)
+    val = strtol (argv[1], NULL, 0);
+
+  if (argc > 2 || val <= 0)
+    usage (argv[0]);
+
+  bench (val);
+
+  return 0;
+}
Carlos O'Donell Feb. 28, 2019, 4:52 a.m. UTC | #2
On 2/1/19 11:27 AM, Wilco Dijkstra wrote:
> Add a malloc micro benchmark to enable accurate testing of the
> various paths in malloc and free.  The benchmark does a varying
> number of allocations of a given block size, then frees them again.
> 
> It tests 3 different scenarios: single-threaded using main arena,
> multi-threaded using thread-arena, main arena with SINGLE_THREAD_P
> false.
> 
> OK for commit?
> 
> ChangeLog:
> 2019-02-01  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* benchtests/Makefile: Add malloc-simple benchmark.
> 	* benchtests/bench-malloc-simple.c: New benchmark.

This broke Fedora Rawhide during CI testing:

BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
BUILDSTDERR:    89 |   unsigned long res;
BUILDSTDERR:       |                 ^~~
BUILDSTDERR: cc1: all warnings being treated as errors

Affects aarch64, armv7hl, and s390x.

I assume we need a "(void) res" like we have in bench-malloc-thread.c?

I'm going to check in a quick fix to Rawhide and report back if anything
else breaks.
Wilco Dijkstra March 4, 2019, 5:35 p.m. UTC | #3
Hi Carlos,

> BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
> BUILDSTDERR:    89 |   unsigned long res;
> BUILDSTDERR:       |                 ^~~
> BUILDSTDERR: cc1: all warnings being treated as errors
>
> Affects aarch64, armv7hl, and s390x.
> 
> I assume we need a "(void) res" like we have in bench-malloc-thread.c?
> 
> I'm going to checkin a quick fix to Rawhide and report back if anything
> else breaks.

Does that enable extra errors somehow? I can't reproduce it.

Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a
patch to just kill it:


Remove TIMING_INIT since it's only used in bench-skeleton.c if there
is no hp-timing support (which will become the default after [1]).

[1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html

ChangeLog:
2019-03-04  Wilco Dijkstra  <wdijkstr@arm.com>

	* benchtests/bench-malloc-simple.c: Remove TIMING_INIT.
	* benchtests/bench-malloc-thread.c: Likewise.
	* benchtests/bench-skeleton.c: Likewise.
	* benchtests/bench-strtod.c: Likewise.
	* benchtests/bench-timing.h: Likewise.

--

diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644
--- a/benchtests/bench-malloc-simple.c
+++ b/benchtests/bench-malloc-simple.c
@@ -86,9 +86,6 @@ bench (unsigned long size)
 {
   size_t iters = NUM_ITERS;
   int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
-  unsigned long res;
-
-  TIMING_INIT (res);
 
   for (int t = 0; t <= 3; t++)
     for (int i = 0; i < NUM_ALLOCS; i++)
diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c
index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644
--- a/benchtests/bench-malloc-thread.c
+++ b/benchtests/bench-malloc-thread.c
@@ -225,7 +225,6 @@ main (int argc, char **argv)
 {
   timing_t cur;
   size_t iters = 0, num_threads = 1;
-  unsigned long res;
   json_ctx_t json_ctx;
   double d_total_s, d_total_i;
   struct sigaction act;
@@ -261,10 +260,6 @@ main (int argc, char **argv)
 
   json_attr_object_begin (&json_ctx, "");
 
-  TIMING_INIT (res);
-
-  (void) res;
-
   memset (&act, 0, sizeof (act));
   act.sa_handler = &alarm_handler;
 
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -48,14 +48,11 @@ main (int argc, char **argv)
 
   memset (&runtime, 0, sizeof (runtime));
 
-  unsigned long iters, res;
+  unsigned long iters = 1000;
 
 #ifdef BENCH_INIT
   BENCH_INIT ();
 #endif
-  TIMING_INIT (res);
-
-  iters = 1000 * res;
 
   json_init (&json_ctx, 2, stdout);
 
diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c
index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644
--- a/benchtests/bench-strtod.c
+++ b/benchtests/bench-strtod.c
@@ -89,9 +89,6 @@ int
 do_bench (void)
 {
   const size_t iters = INNER_LOOP_ITERS;
-  timing_t res __attribute__ ((unused));
-
-  TIMING_INIT (res);
 
   for (size_t i = 0; inputs[i] != NULL; ++i)
     {
diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644
--- a/benchtests/bench-timing.h
+++ b/benchtests/bench-timing.h
@@ -28,8 +28,6 @@ typedef hp_timing_t timing_t;
 
 # define TIMING_TYPE "hp_timing"
 
-# define TIMING_INIT(res) ({ (res) = 1; })
-
 # define TIMING_NOW(var) HP_TIMING_NOW (var)
 # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
 # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
@@ -41,15 +39,6 @@ typedef uint64_t timing_t;
 
 # define TIMING_TYPE "clock_gettime"
 
-/* Measure the resolution of the clock so we can scale the number of
-   benchmark iterations by this value.  */
-# define TIMING_INIT(res) \
-({									      \
-  struct timespec start;						      \
-  clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);			      \
-  (res) = start.tv_nsec;					      \
-})
-
 # define TIMING_NOW(var) \
 ({									      \
   struct timespec tv;							      \
Wilco Dijkstra March 18, 2019, 5:16 p.m. UTC | #4
ping
  

Hi Carlos,

> BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
> BUILDSTDERR:    89 |   unsigned long res;
> BUILDSTDERR:       |                 ^~~
> BUILDSTDERR: cc1: all warnings being treated as errors
>
> Affects aarch64, armv7hl, and s390x.
> 
> I assume we need a "(void) res" like we have in bench-malloc-thread.c?
> 
> I'm going to checkin a quick fix to Rawhide and report back if anything
> else breaks.

Does that enable extra errors somehow? I can't reproduce it.

Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a
patch to just kill it:


Remove TIMING_INIT since it's only used in bench-skeleton.c if there
is no hp-timing support (which will become the default after [1]).

[1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html

ChangeLog:
2019-03-04  Wilco Dijkstra  <wdijkstr@arm.com>

        * benchtests/bench-malloc-simple.c: Remove TIMING_INIT.
        * benchtests/bench-malloc-thread.c: Likewise.
        * benchtests/bench-skeleton.c: Likewise.
        * benchtests/bench-strtod.c: Likewise.
        * benchtests/bench-timing.h: Likewise.

--

diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644
--- a/benchtests/bench-malloc-simple.c
+++ b/benchtests/bench-malloc-simple.c
@@ -86,9 +86,6 @@ bench (unsigned long size)
 {
   size_t iters = NUM_ITERS;
   int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
-  unsigned long res;
-
-  TIMING_INIT (res);
 
   for (int t = 0; t <= 3; t++)
     for (int i = 0; i < NUM_ALLOCS; i++)
diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c
index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644
--- a/benchtests/bench-malloc-thread.c
+++ b/benchtests/bench-malloc-thread.c
@@ -225,7 +225,6 @@ main (int argc, char **argv)
 {
   timing_t cur;
   size_t iters = 0, num_threads = 1;
-  unsigned long res;
   json_ctx_t json_ctx;
   double d_total_s, d_total_i;
   struct sigaction act;
@@ -261,10 +260,6 @@ main (int argc, char **argv)
 
   json_attr_object_begin (&json_ctx, "");
 
-  TIMING_INIT (res);
-
-  (void) res;
-
   memset (&act, 0, sizeof (act));
   act.sa_handler = &alarm_handler;
 
diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -48,14 +48,11 @@ main (int argc, char **argv)
 
   memset (&runtime, 0, sizeof (runtime));
 
-  unsigned long iters, res;
+  unsigned long iters = 1000;
 
 #ifdef BENCH_INIT
   BENCH_INIT ();
 #endif
-  TIMING_INIT (res);
-
-  iters = 1000 * res;
 
   json_init (&json_ctx, 2, stdout);
 
diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c
index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644
--- a/benchtests/bench-strtod.c
+++ b/benchtests/bench-strtod.c
@@ -89,9 +89,6 @@ int
 do_bench (void)
 {
   const size_t iters = INNER_LOOP_ITERS;
-  timing_t res __attribute__ ((unused));
-
-  TIMING_INIT (res);
 
   for (size_t i = 0; inputs[i] != NULL; ++i)
     {
diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644
--- a/benchtests/bench-timing.h
+++ b/benchtests/bench-timing.h
@@ -28,8 +28,6 @@ typedef hp_timing_t timing_t;
 
 # define TIMING_TYPE "hp_timing"
 
-# define TIMING_INIT(res) ({ (res) = 1; })
-
 # define TIMING_NOW(var) HP_TIMING_NOW (var)
 # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
 # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
@@ -41,15 +39,6 @@ typedef uint64_t timing_t;
 
 # define TIMING_TYPE "clock_gettime"
 
-/* Measure the resolution of the clock so we can scale the number of
-   benchmark iterations by this value.  */
-# define TIMING_INIT(res) \
-({                                                                           \
-  struct timespec start;                                                     \
-  clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);                           \
-  (res) = start.tv_nsec;                                             \
-})
-
 # define TIMING_NOW(var) \
 ({                                                                            \
   struct timespec tv;                                                        \
Carlos O'Donell April 9, 2019, 5:25 a.m. UTC | #5
On 3/18/19 1:16 PM, Wilco Dijkstra wrote:
> ping
>    
> 
> Hi Carlos,
> 
>> BUILDSTDERR: bench-malloc-simple.c: In function 'bench':
>> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable]
>> BUILDSTDERR:    89 |   unsigned long res;
>> BUILDSTDERR:       |                 ^~~
>> BUILDSTDERR: cc1: all warnings being treated as errors
>>
>> Affects aarch64, armv7hl, and s390x.
>>
>> I assume we need a "(void) res" like we have in bench-malloc-thread.c?
>>
>> I'm going to checkin a quick fix to Rawhide and report back if anything
>> else breaks.
> 
> Does that enable extra errors somehow? I can't reproduce it.
> 
> Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a
> patch to just kill it:

LGTM.

Sorry for the delay.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>

> 
> Remove TIMING_INIT since it's only used in bench-skeleton.c if there
> is no hp-timing support (which will become the default after [1]).
> 
> [1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html
> 
> ChangeLog:
> 2019-03-04  Wilco Dijkstra  <wdijkstr@arm.com>
> 
>          * benchtests/bench-malloc-simple.c: Remove TIMING_INIT.
>          * benchtests/bench-malloc-thread.c: Likewise.
>          * benchtests/bench-skeleton.c: Likewise.
>          * benchtests/bench-strtod.c: Likewise.
>          * benchtests/bench-timing.h: Likewise.
> 
> --
> 
> diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
> index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644
> --- a/benchtests/bench-malloc-simple.c
> +++ b/benchtests/bench-malloc-simple.c
> @@ -86,9 +86,6 @@ bench (unsigned long size)
>   {
>     size_t iters = NUM_ITERS;
>     int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
> -  unsigned long res;
> -
> -  TIMING_INIT (res);

OK.

>   
>     for (int t = 0; t <= 3; t++)
>       for (int i = 0; i < NUM_ALLOCS; i++)
> diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c
> index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644
> --- a/benchtests/bench-malloc-thread.c
> +++ b/benchtests/bench-malloc-thread.c
> @@ -225,7 +225,6 @@ main (int argc, char **argv)
>   {
>     timing_t cur;
>     size_t iters = 0, num_threads = 1;
> -  unsigned long res;

OK.

>     json_ctx_t json_ctx;
>     double d_total_s, d_total_i;
>     struct sigaction act;
> @@ -261,10 +260,6 @@ main (int argc, char **argv)
>   
>     json_attr_object_begin (&json_ctx, "");
>   
> -  TIMING_INIT (res);
> -
> -  (void) res;

OK.

> -
>     memset (&act, 0, sizeof (act));
>     act.sa_handler = &alarm_handler;
>   
> diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
> index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644
> --- a/benchtests/bench-skeleton.c
> +++ b/benchtests/bench-skeleton.c
> @@ -48,14 +48,11 @@ main (int argc, char **argv)
>   
>     memset (&runtime, 0, sizeof (runtime));
>   
> -  unsigned long iters, res;
> +  unsigned long iters = 1000;

OK. A fixed number of iterations will do.

>   
>   #ifdef BENCH_INIT
>     BENCH_INIT ();
>   #endif
> -  TIMING_INIT (res);
> -
> -  iters = 1000 * res;

OK.

>   
>     json_init (&json_ctx, 2, stdout);
>   
> diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c
> index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644
> --- a/benchtests/bench-strtod.c
> +++ b/benchtests/bench-strtod.c
> @@ -89,9 +89,6 @@ int
>   do_bench (void)
>   {
>     const size_t iters = INNER_LOOP_ITERS;
> -  timing_t res __attribute__ ((unused));
> -
> -  TIMING_INIT (res);

OK.

>   
>     for (size_t i = 0; inputs[i] != NULL; ++i)
>       {
> diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h
> index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644
> --- a/benchtests/bench-timing.h
> +++ b/benchtests/bench-timing.h
> @@ -28,8 +28,6 @@ typedef hp_timing_t timing_t;
>   
>   # define TIMING_TYPE "hp_timing"
>   
> -# define TIMING_INIT(res) ({ (res) = 1; })

OK.

> -
>   # define TIMING_NOW(var) HP_TIMING_NOW (var)
>   # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end))
>   # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff))
> @@ -41,15 +39,6 @@ typedef uint64_t timing_t;
>   
>   # define TIMING_TYPE "clock_gettime"
>   
> -/* Measure the resolution of the clock so we can scale the number of
> -   benchmark iterations by this value.  */
> -# define TIMING_INIT(res) \
> -({                                                                           \
> -  struct timespec start;                                                     \
> -  clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start);                           \
> -  (res) = start.tv_nsec;                                             \
> -})

OK.

> -
>   # define TIMING_NOW(var) \
>   ({                                                                            \
>     struct timespec tv;                                                        \
>      
>
diff mbox series

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -90,7 +90,7 @@  CFLAGS-bench-trunc.c += -fno-builtin
 CFLAGS-bench-truncf.c += -fno-builtin
 
 ifeq (${BENCHSET},)
-bench-malloc := malloc-thread
+bench-malloc := malloc-thread malloc-simple
 else
 bench-malloc := $(filter malloc-%,${BENCHSET})
 endif
@@ -98,7 +98,7 @@  endif
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
-$(objpfx)bench-malloc-thread: $(shared-thread-library)
+$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library)
 
 
 
@@ -165,7 +165,7 @@  bench-clean:
 ifneq ($(strip ${BENCHSET}),)
 VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \
    wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \
-   malloc-thread
+   malloc-thread malloc-simple
 INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET})
 ifneq (${INVALIDBENCHSETNAMES},)
 $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES})
@@ -194,10 +194,18 @@  bench-set: $(binaries-benchset)
 
 bench-malloc: $(binaries-bench-malloc)
 	for run in $^; do \
+	  echo "$${run}"; \
+	  if [ `basename $${run}` = "bench-malloc-thread" ]; then \
 		for thr in 1 8 16 32; do \
 			echo "Running $${run} $${thr}"; \
-	  $(run-bench) $${thr} > $${run}-$${thr}.out; \
-	  done;\
+			$(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  else \
+		for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \
+		  echo "Running $${run} $${thr}"; \
+		  $(run-bench) $${thr} > $${run}-$${thr}.out; \
+		done;\
+	  fi;\
 	done
 
 # Build and execute the benchmark functions.  This target generates JSON
diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c
new file mode 100644
index 0000000000000000000000000000000000000000..995d78965fd65fdf1c84cf85bf38990cd49402b3
--- /dev/null
+++ b/benchtests/bench-malloc-simple.c
@@ -0,0 +1,182 @@ 
+/* Benchmark malloc and free functions.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/resource.h>
+#include "bench-timing.h"
+#include "json-lib.h"
+
+#define NUM_ITERS 1000000
+#define NUM_ALLOCS 4
+#define MAX_ALLOCS 1600
+
+typedef struct
+{
+  size_t iters;
+  size_t size;
+  int n;
+  timing_t elapsed;
+} malloc_args;
+
+static void
+do_benchmark (malloc_args *args, int **arr)
+{
+  timing_t start, stop;
+  size_t iters = args->iters;
+  size_t size = args->size;
+  int n = args->n;
+
+  TIMING_NOW (start);
+
+  for (int j = 0; j < iters; j++)
+    {
+      for (int i = 0; i < n; i++)
+	arr[i] = malloc (size);
+
+      for (int i = 0; i < n; i++)
+	free (arr[i]);
+    }
+
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (args->elapsed, start, stop);
+}
+
+static malloc_args tests[3][NUM_ALLOCS];
+static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS };
+
+static void *
+thread_test (void *p)
+{
+  int **arr = (int**)p;
+
+  /* Run benchmark multi-threaded.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[2][i], arr);
+
+  return p;
+}
+
+void
+bench (unsigned long size)
+{
+  size_t iters = NUM_ITERS;
+  int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*));
+  unsigned long res;
+
+  TIMING_INIT (res);
+
+  for (int t = 0; t <= 3; t++)
+    for (int i = 0; i < NUM_ALLOCS; i++)
+      {
+	tests[t][i].n = allocs[i];
+	tests[t][i].size = size;
+	tests[t][i].iters = iters / allocs[i];
+
+	/* Do a quick warmup run.  */
+	if (t == 0)
+	  do_benchmark (&tests[0][i], arr);
+      }
+
+  /* Run benchmark single threaded in main_arena.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[0][i], arr);
+
+  /* Run benchmark in a thread_arena.  */
+  pthread_t t;
+  pthread_create (&t, NULL, thread_test, (void*)arr);
+  pthread_join (t, NULL);
+
+  /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false.  */
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    do_benchmark (&tests[1][i], arr);
+
+  free (arr);
+
+  json_ctx_t json_ctx;
+
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  json_attr_object_begin (&json_ctx, "functions");
+
+  json_attr_object_begin (&json_ctx, "malloc");
+
+  char s[100];
+  double iters2 = iters;
+
+  json_attr_object_begin (&json_ctx, "");
+  json_attr_double (&json_ctx, "malloc_block_size", size);
+
+  struct rusage usage;
+  getrusage (RUSAGE_SELF, &usage);
+  json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss);
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2);
+    }
+
+  for (int i = 0; i < NUM_ALLOCS; i++)
+    {
+      sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]);
+      json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2);
+    }
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_attr_object_end (&json_ctx);
+
+  json_document_end (&json_ctx);
+}
+
+static void usage (const char *name)
+{
+  fprintf (stderr, "%s: <alloc_size>\n", name);
+  exit (1);
+}
+
+int
+main (int argc, char **argv)
+{
+  long val = 16;
+  if (argc == 2)
+    val = strtol (argv[1], NULL, 0);
+
+  if (argc > 2 || val <= 0)
+    usage (argv[0]);
+
+  bench (val);
+
+  return 0;
+}