Message ID | DB5PR08MB1030A3EB64DA81DB604E140183920@DB5PR08MB1030.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | Add malloc micro benchmark | expand |
Hi DJ, > Looks good to me, although I'd like some additional comments in the test > code. Thanks for the review - I've added some extra comments: +/* Benchmark the malloc/free performance of a varying number of blocks of a + given size. This enables performance tracking of the t-cache and fastbins. + It tests 3 different scenarios: single-threaded using main arena, + multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P + false. */ > + else \ > + for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \ > + echo "Running $${run} $${thr}"; \ > + $(run-bench) $${thr} > $${run}-$${thr}.out; \ > + done;\ > + fi;\ > done > I wonder if this could be done more elegantly, but I'm OK with a simple > approach for now. If we end up adding many more such tests we might > need to revisit this part. The main concern was to get a clean state so that the test of a previous block size doesn't affect subsequent results. > +#define NUM_ITERS 1000000 > +#define NUM_ALLOCS 4 > +#define MAX_ALLOCS 1600 > How long does this test take to run, on average, compared to other > tests? Do we have to worry about increasing timeouts for slow hosts? All the tests together finish in a fraction of the time taken by a single test of bench-malloc-thread, so if anything we need to reduce the time of that one by an order of magnitude (it takes ~5 minutes!). > +static void > +do_benchmark (malloc_args *args, int **arr) > +{ > + timing_t start, stop; > + size_t iters = args->iters; > + size_t size = args->size; > + int n = args->n; > + > + TIMING_NOW (start); > + > + for (int j = 0; j < iters; j++) > + { > + for (int i = 0; i < n; i++) > + arr[i] = malloc (size); > + > + for (int i = 0; i < n; i++) > + free (arr[i]); > + } > + > + TIMING_NOW (stop); > + > + TIMING_DIFF (args->elapsed, start, stop); > +} > Simple loop, but doesn't test for malloc returning NULL. Yeah, the benchmark doesn't need to care since the amount we allocate is tiny (6.4MBytes). 
Cheers, Wilco I've committed this: Add a malloc micro benchmark to enable accurate testing of the various paths in malloc and free. The benchmark does a varying number of allocations of a given block size, then frees them again. It tests 3 different scenarios: single-threaded using main arena, multi-threaded using thread-arena, main arena with SINGLE_THREAD_P false. OK for commit? ChangeLog: 2019-02-14 Wilco Dijkstra <wdijkstr@arm.com> * benchtests/Makefile: Add malloc-simple benchmark. * benchtests/bench-malloc-simple.c: New benchmark. -- diff --git a/benchtests/Makefile b/benchtests/Makefile index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -90,7 +90,7 @@ CFLAGS-bench-trunc.c += -fno-builtin CFLAGS-bench-truncf.c += -fno-builtin ifeq (${BENCHSET},) -bench-malloc := malloc-thread +bench-malloc := malloc-thread malloc-simple else bench-malloc := $(filter malloc-%,${BENCHSET}) endif @@ -98,7 +98,7 @@ endif $(addprefix $(objpfx)bench-,$(bench-math)): $(libm) $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm) $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library) -$(objpfx)bench-malloc-thread: $(shared-thread-library) +$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library) @@ -165,7 +165,7 @@ bench-clean: ifneq ($(strip ${BENCHSET}),) VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \ wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \ - malloc-thread + malloc-thread malloc-simple INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET}) ifneq (${INVALIDBENCHSETNAMES},) $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES}) @@ -194,10 +194,18 @@ bench-set: $(binaries-benchset) bench-malloc: $(binaries-bench-malloc) for run in $^; do \ + echo "$${run}"; \ + if [ `basename $${run}` = "bench-malloc-thread" ]; then \ for thr in 1 8 16 32; do \ echo 
"Running $${run} $${thr}"; \ - $(run-bench) $${thr} > $${run}-$${thr}.out; \ - done;\ + $(run-bench) $${thr} > $${run}-$${thr}.out; \ + done;\ + else \ + for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \ + echo "Running $${run} $${thr}"; \ + $(run-bench) $${thr} > $${run}-$${thr}.out; \ + done;\ + fi;\ done # Build and execute the benchmark functions. This target generates JSON diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c new file mode 100644 index 0000000000000000000000000000000000000000..83203ff3187654a1710c9ef81016f854957b9d64 --- /dev/null +++ b/benchtests/bench-malloc-simple.c @@ -0,0 +1,188 @@ +/* Benchmark malloc and free functions. + Copyright (C) 2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <sys/resource.h> +#include "bench-timing.h" +#include "json-lib.h" + +/* Benchmark the malloc/free performance of a varying number of blocks of a + given size. This enables performance tracking of the t-cache and fastbins. + It tests 3 different scenarios: single-threaded using main arena, + multi-threaded using thread-arena, and main arena with SINGLE_THREAD_P + false. 
*/ + +#define NUM_ITERS 200000 +#define NUM_ALLOCS 4 +#define MAX_ALLOCS 1600 + +typedef struct +{ + size_t iters; + size_t size; + int n; + timing_t elapsed; +} malloc_args; + +static void +do_benchmark (malloc_args *args, int **arr) +{ + timing_t start, stop; + size_t iters = args->iters; + size_t size = args->size; + int n = args->n; + + TIMING_NOW (start); + + for (int j = 0; j < iters; j++) + { + for (int i = 0; i < n; i++) + arr[i] = malloc (size); + + for (int i = 0; i < n; i++) + free (arr[i]); + } + + TIMING_NOW (stop); + + TIMING_DIFF (args->elapsed, start, stop); +} + +static malloc_args tests[3][NUM_ALLOCS]; +static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS }; + +static void * +thread_test (void *p) +{ + int **arr = (int**)p; + + /* Run benchmark multi-threaded. */ + for (int i = 0; i < NUM_ALLOCS; i++) + do_benchmark (&tests[2][i], arr); + + return p; +} + +void +bench (unsigned long size) +{ + size_t iters = NUM_ITERS; + int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*)); + unsigned long res; + + TIMING_INIT (res); + + for (int t = 0; t <= 3; t++) + for (int i = 0; i < NUM_ALLOCS; i++) + { + tests[t][i].n = allocs[i]; + tests[t][i].size = size; + tests[t][i].iters = iters / allocs[i]; + + /* Do a quick warmup run. */ + if (t == 0) + do_benchmark (&tests[0][i], arr); + } + + /* Run benchmark single threaded in main_arena. */ + for (int i = 0; i < NUM_ALLOCS; i++) + do_benchmark (&tests[0][i], arr); + + /* Run benchmark in a thread_arena. */ + pthread_t t; + pthread_create (&t, NULL, thread_test, (void*)arr); + pthread_join (t, NULL); + + /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false. 
*/ + for (int i = 0; i < NUM_ALLOCS; i++) + do_benchmark (&tests[1][i], arr); + + free (arr); + + json_ctx_t json_ctx; + + json_init (&json_ctx, 0, stdout); + + json_document_begin (&json_ctx); + + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE); + + json_attr_object_begin (&json_ctx, "functions"); + + json_attr_object_begin (&json_ctx, "malloc"); + + char s[100]; + double iters2 = iters; + + json_attr_object_begin (&json_ctx, ""); + json_attr_double (&json_ctx, "malloc_block_size", size); + + struct rusage usage; + getrusage (RUSAGE_SELF, &usage); + json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss); + + for (int i = 0; i < NUM_ALLOCS; i++) + { + sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]); + json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2); + } + + for (int i = 0; i < NUM_ALLOCS; i++) + { + sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]); + json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2); + } + + for (int i = 0; i < NUM_ALLOCS; i++) + { + sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]); + json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2); + } + + json_attr_object_end (&json_ctx); + + json_attr_object_end (&json_ctx); + + json_attr_object_end (&json_ctx); + + json_document_end (&json_ctx); +} + +static void usage (const char *name) +{ + fprintf (stderr, "%s: <alloc_size>\n", name); + exit (1); +} + +int +main (int argc, char **argv) +{ + long val = 16; + if (argc == 2) + val = strtol (argv[1], NULL, 0); + + if (argc > 2 || val <= 0) + usage (argv[0]); + + bench (val); + + return 0; +}
On 2/1/19 11:27 AM, Wilco Dijkstra wrote: > Add a malloc micro benchmark to enable accurate testing of the > various paths in malloc and free. The benchmark does a varying > number of allocations of a given block size, then frees them again. > > It tests 3 different scenarios: single-threaded using main arena, > multi-threaded using thread-arena, main arena with SINGLE_THREAD_P > false. > > OK for commit? > > ChangeLog: > 2019-02-01 Wilco Dijkstra <wdijkstr@arm.com> > > * benchtests/Makefile: Add malloc-simple benchmark. > * benchtests/bench-malloc-simple.c: New benchmark. This broke Fedora Rawhide during CI testing: BUILDSTDERR: bench-malloc-simple.c: In function 'bench': BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable] BUILDSTDERR: 89 | unsigned long res; BUILDSTDERR: | ^~~ BUILDSTDERR: cc1: all warnings being treated as errors Affects aarch64, armv7hl, and s390x. I assume we need a "(void) res" like we have in bench-malloc-thread.c? I'm going to check in a quick fix to Rawhide and report back if anything else breaks.
Hi Carlos, > BUILDSTDERR: bench-malloc-simple.c: In function 'bench': > BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable] > BUILDSTDERR: 89 | unsigned long res; > BUILDSTDERR: | ^~~ > BUILDSTDERR: cc1: all warnings being treated as errors > > Affects aarch64, armv7hl, and s390x. > > I assume we need a "(void) res" like we have in bench-malloc-thread.c? > > I'm going to checkin a quick fix to Rawhide and report back if anything > else breaks. Does that enable extra errors somehow? I can't reproduce it. Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a patch to just kill it: Remove TIMING_INIT since it's only used in bench-skeleton.c if there is no hp-timing support (which will become the default after [1]). [1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html ChangeLog: 2019-03-04 Wilco Dijkstra <wdijkstr@arm.com> * benchtests/bench-malloc-simple.c: Remove TIMING_INIT. * benchtests/bench-malloc-thread.c: Likewise. * benchtests/bench-skeleton.c: Likewise. * benchtests/bench-strtod.c: Likewise. * benchtests/bench-timing.h: Likewise. 
-- diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644 --- a/benchtests/bench-malloc-simple.c +++ b/benchtests/bench-malloc-simple.c @@ -86,9 +86,6 @@ bench (unsigned long size) { size_t iters = NUM_ITERS; int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*)); - unsigned long res; - - TIMING_INIT (res); for (int t = 0; t <= 3; t++) for (int i = 0; i < NUM_ALLOCS; i++) diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644 --- a/benchtests/bench-malloc-thread.c +++ b/benchtests/bench-malloc-thread.c @@ -225,7 +225,6 @@ main (int argc, char **argv) { timing_t cur; size_t iters = 0, num_threads = 1; - unsigned long res; json_ctx_t json_ctx; double d_total_s, d_total_i; struct sigaction act; @@ -261,10 +260,6 @@ main (int argc, char **argv) json_attr_object_begin (&json_ctx, ""); - TIMING_INIT (res); - - (void) res; - memset (&act, 0, sizeof (act)); act.sa_handler = &alarm_handler; diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644 --- a/benchtests/bench-skeleton.c +++ b/benchtests/bench-skeleton.c @@ -48,14 +48,11 @@ main (int argc, char **argv) memset (&runtime, 0, sizeof (runtime)); - unsigned long iters, res; + unsigned long iters = 1000; #ifdef BENCH_INIT BENCH_INIT (); #endif - TIMING_INIT (res); - - iters = 1000 * res; json_init (&json_ctx, 2, stdout); diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644 --- a/benchtests/bench-strtod.c +++ b/benchtests/bench-strtod.c @@ -89,9 +89,6 @@ int do_bench (void) { const size_t iters = INNER_LOOP_ITERS; - timing_t res __attribute__ ((unused)); - - 
TIMING_INIT (res); for (size_t i = 0; inputs[i] != NULL; ++i) { diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644 --- a/benchtests/bench-timing.h +++ b/benchtests/bench-timing.h @@ -28,8 +28,6 @@ typedef hp_timing_t timing_t; # define TIMING_TYPE "hp_timing" -# define TIMING_INIT(res) ({ (res) = 1; }) - # define TIMING_NOW(var) HP_TIMING_NOW (var) # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end)) # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff)) @@ -41,15 +39,6 @@ typedef uint64_t timing_t; # define TIMING_TYPE "clock_gettime" -/* Measure the resolution of the clock so we can scale the number of - benchmark iterations by this value. */ -# define TIMING_INIT(res) \ -({ \ - struct timespec start; \ - clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); \ - (res) = start.tv_nsec; \ -}) - # define TIMING_NOW(var) \ ({ \ struct timespec tv; \
ping Hi Carlos, > BUILDSTDERR: bench-malloc-simple.c: In function 'bench': > BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable] > BUILDSTDERR: 89 | unsigned long res; > BUILDSTDERR: | ^~~ > BUILDSTDERR: cc1: all warnings being treated as errors > > Affects aarch64, armv7hl, and s390x. > > I assume we need a "(void) res" like we have in bench-malloc-thread.c? > > I'm going to checkin a quick fix to Rawhide and report back if anything > else breaks. Does that enable extra errors somehow? I can't reproduce it. Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a patch to just kill it: Remove TIMING_INIT since it's only used in bench-skeleton.c if there is no hp-timing support (which will become the default after [1]). [1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html ChangeLog: 2019-03-04 Wilco Dijkstra <wdijkstr@arm.com> * benchtests/bench-malloc-simple.c: Remove TIMING_INIT. * benchtests/bench-malloc-thread.c: Likewise. * benchtests/bench-skeleton.c: Likewise. * benchtests/bench-strtod.c: Likewise. * benchtests/bench-timing.h: Likewise. 
-- diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644 --- a/benchtests/bench-malloc-simple.c +++ b/benchtests/bench-malloc-simple.c @@ -86,9 +86,6 @@ bench (unsigned long size) { size_t iters = NUM_ITERS; int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*)); - unsigned long res; - - TIMING_INIT (res); for (int t = 0; t <= 3; t++) for (int i = 0; i < NUM_ALLOCS; i++) diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644 --- a/benchtests/bench-malloc-thread.c +++ b/benchtests/bench-malloc-thread.c @@ -225,7 +225,6 @@ main (int argc, char **argv) { timing_t cur; size_t iters = 0, num_threads = 1; - unsigned long res; json_ctx_t json_ctx; double d_total_s, d_total_i; struct sigaction act; @@ -261,10 +260,6 @@ main (int argc, char **argv) json_attr_object_begin (&json_ctx, ""); - TIMING_INIT (res); - - (void) res; - memset (&act, 0, sizeof (act)); act.sa_handler = &alarm_handler; diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644 --- a/benchtests/bench-skeleton.c +++ b/benchtests/bench-skeleton.c @@ -48,14 +48,11 @@ main (int argc, char **argv) memset (&runtime, 0, sizeof (runtime)); - unsigned long iters, res; + unsigned long iters = 1000; #ifdef BENCH_INIT BENCH_INIT (); #endif - TIMING_INIT (res); - - iters = 1000 * res; json_init (&json_ctx, 2, stdout); diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644 --- a/benchtests/bench-strtod.c +++ b/benchtests/bench-strtod.c @@ -89,9 +89,6 @@ int do_bench (void) { const size_t iters = INNER_LOOP_ITERS; - timing_t res __attribute__ ((unused)); - - 
TIMING_INIT (res); for (size_t i = 0; inputs[i] != NULL; ++i) { diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644 --- a/benchtests/bench-timing.h +++ b/benchtests/bench-timing.h @@ -28,8 +28,6 @@ typedef hp_timing_t timing_t; # define TIMING_TYPE "hp_timing" -# define TIMING_INIT(res) ({ (res) = 1; }) - # define TIMING_NOW(var) HP_TIMING_NOW (var) # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end)) # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff)) @@ -41,15 +39,6 @@ typedef uint64_t timing_t; # define TIMING_TYPE "clock_gettime" -/* Measure the resolution of the clock so we can scale the number of - benchmark iterations by this value. */ -# define TIMING_INIT(res) \ -({ \ - struct timespec start; \ - clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); \ - (res) = start.tv_nsec; \ -}) - # define TIMING_NOW(var) \ ({ \ struct timespec tv; \
On 3/18/19 1:16 PM, Wilco Dijkstra wrote: > ping > > > Hi Carlos, > >> BUILDSTDERR: bench-malloc-simple.c: In function 'bench': >> BUILDSTDERR: bench-malloc-simple.c:89:17: error: variable 'res' set but not used [-Werror=unused-but-set-variable] >> BUILDSTDERR: 89 | unsigned long res; >> BUILDSTDERR: | ^~~ >> BUILDSTDERR: cc1: all warnings being treated as errors >> >> Affects aarch64, armv7hl, and s390x. >> >> I assume we need a "(void) res" like we have in bench-malloc-thread.c? >> >> I'm going to checkin a quick fix to Rawhide and report back if anything >> else breaks. > > Does that enable extra errors somehow? I can't reproduce it. > > Anyway TIMING_INIT is redundant for bench-malloc-*.c, so here's a > patch to just kill it: LGTM. Sorry for the delay. Reviewed-by: Carlos O'Donell <carlos@redhat.com> > > Remove TIMING_INIT since it's only used in bench-skeleton.c if there > is no hp-timing support (which will become the default after [1]). > > [1] https://sourceware.org/ml/libc-alpha/2019-02/msg00468.html > > ChangeLog: > 2019-03-04 Wilco Dijkstra <wdijkstr@arm.com> > > * benchtests/bench-malloc-simple.c: Remove TIMING_INIT. > * benchtests/bench-malloc-thread.c: Likewise. > * benchtests/bench-skeleton.c: Likewise. > * benchtests/bench-strtod.c: Likewise. > * benchtests/bench-timing.h: Likewise. > > -- > > diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c > index 83203ff3187654a1710c9ef81016f854957b9d64..b8bb2cc116953c6691c17633d18c5661c7d9243e 100644 > --- a/benchtests/bench-malloc-simple.c > +++ b/benchtests/bench-malloc-simple.c > @@ -86,9 +86,6 @@ bench (unsigned long size) > { > size_t iters = NUM_ITERS; > int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*)); > - unsigned long res; > - > - TIMING_INIT (res); OK. 
> > for (int t = 0; t <= 3; t++) > for (int i = 0; i < NUM_ALLOCS; i++) > diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c > index bb4ba727a88059ecbe7305f5b8ad1693c1f1f266..52261425b0f1af32c17328ea5e0a5bb6f230df47 100644 > --- a/benchtests/bench-malloc-thread.c > +++ b/benchtests/bench-malloc-thread.c > @@ -225,7 +225,6 @@ main (int argc, char **argv) > { > timing_t cur; > size_t iters = 0, num_threads = 1; > - unsigned long res; OK. > json_ctx_t json_ctx; > double d_total_s, d_total_i; > struct sigaction act; > @@ -261,10 +260,6 @@ main (int argc, char **argv) > > json_attr_object_begin (&json_ctx, ""); > > - TIMING_INIT (res); > - > - (void) res; OK. > - > memset (&act, 0, sizeof (act)); > act.sa_handler = &alarm_handler; > > diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c > index 37625c4296882268f6260d99adbc7f0295164ffc..854151e5a82028e74fe3a966e82004572542f411 100644 > --- a/benchtests/bench-skeleton.c > +++ b/benchtests/bench-skeleton.c > @@ -48,14 +48,11 @@ main (int argc, char **argv) > > memset (&runtime, 0, sizeof (runtime)); > > - unsigned long iters, res; > + unsigned long iters = 1000; OK. A fixed number of iterations will do. > > #ifdef BENCH_INIT > BENCH_INIT (); > #endif > - TIMING_INIT (res); > - > - iters = 1000 * res; OK. > > json_init (&json_ctx, 2, stdout); > > diff --git a/benchtests/bench-strtod.c b/benchtests/bench-strtod.c > index 4de0b9acb67eb925a80249322957ce8b3c08c8d6..d5b2503553ef74f33cace919ae9c62f79cd11c9c 100644 > --- a/benchtests/bench-strtod.c > +++ b/benchtests/bench-strtod.c > @@ -89,9 +89,6 @@ int > do_bench (void) > { > const size_t iters = INNER_LOOP_ITERS; > - timing_t res __attribute__ ((unused)); > - > - TIMING_INIT (res); OK. 
> > for (size_t i = 0; inputs[i] != NULL; ++i) > { > diff --git a/benchtests/bench-timing.h b/benchtests/bench-timing.h > index 41b7324527b9deed67b3479cb1308fbd291bc5ca..f9b19fcd29efb45ea02c375e37caba94c93956d1 100644 > --- a/benchtests/bench-timing.h > +++ b/benchtests/bench-timing.h > @@ -28,8 +28,6 @@ typedef hp_timing_t timing_t; > > # define TIMING_TYPE "hp_timing" > > -# define TIMING_INIT(res) ({ (res) = 1; }) OK. > - > # define TIMING_NOW(var) HP_TIMING_NOW (var) > # define TIMING_DIFF(diff, start, end) HP_TIMING_DIFF ((diff), (start), (end)) > # define TIMING_ACCUM(sum, diff) HP_TIMING_ACCUM_NT ((sum), (diff)) > @@ -41,15 +39,6 @@ typedef uint64_t timing_t; > > # define TIMING_TYPE "clock_gettime" > > -/* Measure the resolution of the clock so we can scale the number of > - benchmark iterations by this value. */ > -# define TIMING_INIT(res) \ > -({ \ > - struct timespec start; \ > - clock_getres (CLOCK_PROCESS_CPUTIME_ID, &start); \ > - (res) = start.tv_nsec; \ > -}) OK. > - > # define TIMING_NOW(var) \ > ({ \ > struct timespec tv; \ > >
diff --git a/benchtests/Makefile b/benchtests/Makefile index 12036b1935dc7ea84b421f024d6fe3190ae35a6e..09f7cb8e475a312268eebb4d346edde70d22bb3d 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -90,7 +90,7 @@ CFLAGS-bench-trunc.c += -fno-builtin CFLAGS-bench-truncf.c += -fno-builtin ifeq (${BENCHSET},) -bench-malloc := malloc-thread +bench-malloc := malloc-thread malloc-simple else bench-malloc := $(filter malloc-%,${BENCHSET}) endif @@ -98,7 +98,7 @@ endif $(addprefix $(objpfx)bench-,$(bench-math)): $(libm) $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm) $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library) -$(objpfx)bench-malloc-thread: $(shared-thread-library) +$(addprefix $(objpfx)bench-,$(bench-malloc)): $(shared-thread-library) @@ -165,7 +165,7 @@ bench-clean: ifneq ($(strip ${BENCHSET}),) VALIDBENCHSETNAMES := bench-pthread bench-math bench-string string-benchset \ wcsmbs-benchset stdlib-benchset stdio-common-benchset math-benchset \ - malloc-thread + malloc-thread malloc-simple INVALIDBENCHSETNAMES := $(filter-out ${VALIDBENCHSETNAMES},${BENCHSET}) ifneq (${INVALIDBENCHSETNAMES},) $(info The following values in BENCHSET are invalid: ${INVALIDBENCHSETNAMES}) @@ -194,10 +194,18 @@ bench-set: $(binaries-benchset) bench-malloc: $(binaries-bench-malloc) for run in $^; do \ + echo "$${run}"; \ + if [ `basename $${run}` = "bench-malloc-thread" ]; then \ for thr in 1 8 16 32; do \ echo "Running $${run} $${thr}"; \ - $(run-bench) $${thr} > $${run}-$${thr}.out; \ - done;\ + $(run-bench) $${thr} > $${run}-$${thr}.out; \ + done;\ + else \ + for thr in 8 16 32 64 128 256 512 1024 2048 4096; do \ + echo "Running $${run} $${thr}"; \ + $(run-bench) $${thr} > $${run}-$${thr}.out; \ + done;\ + fi;\ done # Build and execute the benchmark functions. 
This target generates JSON diff --git a/benchtests/bench-malloc-simple.c b/benchtests/bench-malloc-simple.c new file mode 100644 index 0000000000000000000000000000000000000000..995d78965fd65fdf1c84cf85bf38990cd49402b3 --- /dev/null +++ b/benchtests/bench-malloc-simple.c @@ -0,0 +1,182 @@ +/* Benchmark malloc and free functions. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <sys/resource.h> +#include "bench-timing.h" +#include "json-lib.h" + +#define NUM_ITERS 1000000 +#define NUM_ALLOCS 4 +#define MAX_ALLOCS 1600 + +typedef struct +{ + size_t iters; + size_t size; + int n; + timing_t elapsed; +} malloc_args; + +static void +do_benchmark (malloc_args *args, int **arr) +{ + timing_t start, stop; + size_t iters = args->iters; + size_t size = args->size; + int n = args->n; + + TIMING_NOW (start); + + for (int j = 0; j < iters; j++) + { + for (int i = 0; i < n; i++) + arr[i] = malloc (size); + + for (int i = 0; i < n; i++) + free (arr[i]); + } + + TIMING_NOW (stop); + + TIMING_DIFF (args->elapsed, start, stop); +} + +static malloc_args tests[3][NUM_ALLOCS]; +static int allocs[NUM_ALLOCS] = { 25, 100, 400, MAX_ALLOCS }; + +static void * +thread_test (void *p) +{ + int **arr = (int**)p; + + /* Run benchmark multi-threaded. */ + for (int i = 0; i < NUM_ALLOCS; i++) + do_benchmark (&tests[2][i], arr); + + return p; +} + +void +bench (unsigned long size) +{ + size_t iters = NUM_ITERS; + int **arr = (int**) malloc (MAX_ALLOCS * sizeof (void*)); + unsigned long res; + + TIMING_INIT (res); + + for (int t = 0; t <= 3; t++) + for (int i = 0; i < NUM_ALLOCS; i++) + { + tests[t][i].n = allocs[i]; + tests[t][i].size = size; + tests[t][i].iters = iters / allocs[i]; + + /* Do a quick warmup run. */ + if (t == 0) + do_benchmark (&tests[0][i], arr); + } + + /* Run benchmark single threaded in main_arena. */ + for (int i = 0; i < NUM_ALLOCS; i++) + do_benchmark (&tests[0][i], arr); + + /* Run benchmark in a thread_arena. */ + pthread_t t; + pthread_create (&t, NULL, thread_test, (void*)arr); + pthread_join (t, NULL); + + /* Repeat benchmark in main_arena with SINGLE_THREAD_P == false. 
*/ + for (int i = 0; i < NUM_ALLOCS; i++) + do_benchmark (&tests[1][i], arr); + + free (arr); + + json_ctx_t json_ctx; + + json_init (&json_ctx, 0, stdout); + + json_document_begin (&json_ctx); + + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE); + + json_attr_object_begin (&json_ctx, "functions"); + + json_attr_object_begin (&json_ctx, "malloc"); + + char s[100]; + double iters2 = iters; + + json_attr_object_begin (&json_ctx, ""); + json_attr_double (&json_ctx, "malloc_block_size", size); + + struct rusage usage; + getrusage (RUSAGE_SELF, &usage); + json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss); + + for (int i = 0; i < NUM_ALLOCS; i++) + { + sprintf (s, "main_arena_st_allocs_%04d_time", allocs[i]); + json_attr_double (&json_ctx, s, tests[0][i].elapsed / iters2); + } + + for (int i = 0; i < NUM_ALLOCS; i++) + { + sprintf (s, "main_arena_mt_allocs_%04d_time", allocs[i]); + json_attr_double (&json_ctx, s, tests[1][i].elapsed / iters2); + } + + for (int i = 0; i < NUM_ALLOCS; i++) + { + sprintf (s, "thread_arena__allocs_%04d_time", allocs[i]); + json_attr_double (&json_ctx, s, tests[2][i].elapsed / iters2); + } + + json_attr_object_end (&json_ctx); + + json_attr_object_end (&json_ctx); + + json_attr_object_end (&json_ctx); + + json_document_end (&json_ctx); +} + +static void usage (const char *name) +{ + fprintf (stderr, "%s: <alloc_size>\n", name); + exit (1); +} + +int +main (int argc, char **argv) +{ + long val = 16; + if (argc == 2) + val = strtol (argv[1], NULL, 0); + + if (argc > 2 || val <= 0) + usage (argv[0]); + + bench (val); + + return 0; +}