Message ID | 20220421032841.3004316-1-wangyang.guo@intel.com |
---|---|
State | New |
Headers | show |
Series | [v2] benchtests: Add pthread-mutex-locks bench | expand |
On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote: > > Benchmark for testing pthread mutex locks performance with different > threads and critical sections. > > The test configuration consists of 3 parts: > 1. thread number > 2. critical-section length > 3. non-critical-section length > > Thread number starts from 1 and increased by 2x until num of CPU cores > (nprocs). An additional over-saturation case (1.25 * nprocs) is also > included. > Critical-section is represented by a loop of shared do_filler(), > length can be determined by the loop iters. > Non-critical-section is similiar to the critical-section, except it's > based on non-shared do_filler(). > > Currently, adaptive pthread_mutex lock is tested. > > v2: Fix benchout json schema validation error. > --- > benchtests/Makefile | 2 + > benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++ > 2 files changed, 290 insertions(+) > create mode 100644 benchtests/bench-pthread-mutex-locks.c > > diff --git a/benchtests/Makefile b/benchtests/Makefile > index 8dfca592fd..b477042e6c 100644 > --- a/benchtests/Makefile > +++ b/benchtests/Makefile > @@ -102,6 +102,7 @@ endif > > bench-pthread := \ > pthread-locks \ > + pthread-mutex-locks \ > pthread_once \ > thread_create \ > # bench-pthread > @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests) > $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests) > $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests) > $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests) > +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests) > > > > diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c > new file mode 100644 > index 0000000000..e934b0001a > --- /dev/null > +++ b/benchtests/bench-pthread-mutex-locks.c > @@ -0,0 +1,288 @@ > +/* Measure mutex_lock for different threads and critical sections. 
> + Copyright (C) 2020-2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define TEST_MAIN > +#define TEST_NAME "pthread-mutex-locks" > +#define TIMEOUT (20 * 60) > + > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <unistd.h> > +#include <math.h> > +#include <pthread.h> > +#include <sys/time.h> > +#include <sys/sysinfo.h> > +#include "bench-timing.h" > +#include "json-lib.h" > + > +static pthread_mutex_t lock; > +static pthread_mutexattr_t attr; > +static pthread_barrier_t barrier; > + > +#define START_ITERS 1000 > + > +#pragma GCC push_options > +#pragma GCC optimize(1) > + > +static int __attribute__ ((noinline)) fibonacci (int i) > +{ > + asm(""); > + if (i > 2) > + return fibonacci (i - 1) + fibonacci (i - 2); > + return 10 + i; > +} > + > +static void > +do_filler (void) > +{ > + char buf1[512], buf2[512]; > + int f = fibonacci (4); > + memcpy (buf1, buf2, f); > +} > + > +static void > +do_filler_shared (void) > +{ > + static char buf1[512], buf2[512]; > + int f = fibonacci (4); > + memcpy (buf1, buf2, f); > +} > + > +#pragma GCC pop_options > + > +#define UNIT_WORK_CRT do_filler_shared () > +#define UNIT_WORK_NON_CRT do_filler () > + > +static inline void > +critical_section (int length) > +{ > + 
for (int i = length; i >= 0; i--) > + UNIT_WORK_CRT; > +} > + > +static inline void > +non_critical_section (int length) > +{ > + for (int i = length; i >= 0; i--) > + UNIT_WORK_NON_CRT; > +} > + > +typedef struct Worker_Params > +{ > + long iters; > + int crt_len; > + int non_crt_len; > + timing_t duration; > +} Worker_Params; > + > +static void * > +worker (void *v) > +{ > + timing_t start, stop; > + Worker_Params *p = (Worker_Params *) v; > + long iters = p->iters; > + int crt_len = p->crt_len; > + int non_crt_len = p->non_crt_len; > + > + pthread_barrier_wait (&barrier); > + TIMING_NOW (start); > + while (iters--) > + { > + pthread_mutex_lock (&lock); > + critical_section (crt_len); > + pthread_mutex_unlock (&lock); > + non_critical_section (non_crt_len); > + } > + TIMING_NOW (stop); > + > + TIMING_DIFF (p->duration, start, stop); > + return NULL; > +} > + > +static double > +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) > +{ > + int i; > + timing_t mean; > + Worker_Params *p, params[num_threads]; > + pthread_t threads[num_threads]; > + > + pthread_mutex_init (&lock, &attr); > + pthread_barrier_init (&barrier, NULL, num_threads); > + > + for (i = 0; i < num_threads; i++) > + { > + p = &params[i]; > + p->iters = iters; > + p->crt_len = crt_len; > + p->non_crt_len = non_crt_len; > + pthread_create (&threads[i], NULL, worker, (void *) p); > + } > + for (i = 0; i < num_threads; i++) > + pthread_join (threads[i], NULL); > + > + pthread_mutex_destroy (&lock); > + pthread_barrier_destroy (&barrier); > + > + mean = 0; > + for (i = 0; i < num_threads; i++) > + mean += params[i].duration; > + mean /= num_threads; > + return mean; > +} > + > +#define RUN_COUNT 10 > +#define MIN_TEST_SEC 0.01 > + > +static void > +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len, > + json_ctx_t *js) > +{ > + timing_t cur; > + struct timeval ts, te; > + double tsd, ted, td; > + long iters, iters_limit, total_iters; > + timing_t 
curs[RUN_COUNT + 2]; > + int i, j; > + double mean, stdev; > + > + iters = START_ITERS; > + iters_limit = LONG_MAX / 100; > + > + while (1) > + { > + gettimeofday (&ts, NULL); > + cur = do_one_test (num_threads, crt_len, non_crt_len, iters); > + gettimeofday (&te, NULL); > + /* Make sure the test to run at least MIN_TEST_SEC. */ > + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; > + ted = te.tv_sec + te.tv_usec / 1000000.0; > + td = ted - tsd; > + if (td >= MIN_TEST_SEC || iters >= iters_limit) > + break; > + > + iters *= 10; > + } > + > + curs[0] = cur; > + for (i = 1; i < RUN_COUNT + 2; i++) > + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters); > + > + /* Sort the results so we can discard the fastest and slowest > + times as outliers. */ > + for (i = 0; i < RUN_COUNT + 1; i++) > + for (j = i + 1; j < RUN_COUNT + 2; j++) > + if (curs[i] > curs[j]) > + { > + timing_t temp = curs[i]; > + curs[i] = curs[j]; > + curs[j] = temp; > + } > + > + /* Calculate mean and standard deviation. 
*/ > + mean = 0.0; > + total_iters = iters * num_threads; > + for (i = 1; i < RUN_COUNT + 1; i++) > + mean += (double) curs[i] / (double) total_iters; > + mean /= RUN_COUNT; > + > + stdev = 0.0; > + for (i = 1; i < RUN_COUNT + 1; i++) > + { > + double s = (double) curs[i] / (double) total_iters - mean; > + stdev += s * s; > + } > + stdev = sqrt (stdev / (RUN_COUNT - 1)); > + > + char buf[256]; > + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name, > + non_crt_len, crt_len, num_threads); > + > + json_attr_object_begin (js, buf); > + > + json_attr_double (js, "duration", (double) cur); > + json_attr_double (js, "iterations", (double) total_iters); > + json_attr_double (js, "mean", mean); > + json_attr_double (js, "stdev", stdev); > + json_attr_double (js, "min-outlier", > + (double) curs[0] / (double) total_iters); > + json_attr_double (js, "min", (double) curs[1] / (double) total_iters); > + json_attr_double (js, "max", > + (double) curs[RUN_COUNT] / (double) total_iters); > + json_attr_double (js, "max-outlier", > + (double) curs[RUN_COUNT + 1] / (double) total_iters); > + > + json_attr_object_end (js); > +} > + > +#define TH_CONF_MAX 10 > + > +int > +do_bench (void) > +{ > + int rv = 0; > + json_ctx_t json_ctx; > + int i, j, k; > + int th_num, th_conf, nprocs; > + int threads[TH_CONF_MAX]; > + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 }; > + int non_crt_lens[] = { 1, 32, 128 }; > + char name[128]; > + > + json_init (&json_ctx, 2, stdout); > + json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); > + > + /* The thread config begins from 1, and increases by 2x until nprocs. > + We also wants to test over-saturation case (1.25*nprocs). 
*/ > + nprocs = get_nprocs (); > + th_num = 1; > + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++) > + { > + threads[th_conf] = th_num; > + th_num <<= 1; > + } > + threads[th_conf++] = nprocs; > + threads[th_conf++] = nprocs + nprocs / 4; > + > + pthread_mutexattr_init (&attr); > + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); > + snprintf (name, sizeof name, "type=adaptive"); > + > + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) > + { > + int non_crt_len = non_crt_lens[k]; > + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++) > + { > + int crt_len = crt_lens[j]; > + for (i = 0; i < th_conf; i++) > + { > + th_num = threads[i]; > + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx); > + } > + } > + } > + > + json_attr_object_end (&json_ctx); > + > + return rv; > +} > + > +#define TEST_FUNCTION do_bench () > + > +#include "../test-skeleton.c" > -- > 2.35.1 > Can you run clang-format on this? Otherwise LGTM.
On 4/21/2022 9:13 PM, Noah Goldstein via Libc-alpha wrote: > On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote: >> >> Benchmark for testing pthread mutex locks performance with different >> threads and critical sections. >> >> The test configuration consists of 3 parts: >> 1. thread number >> 2. critical-section length >> 3. non-critical-section length >> >> Thread number starts from 1 and increased by 2x until num of CPU cores >> (nprocs). An additional over-saturation case (1.25 * nprocs) is also >> included. >> Critical-section is represented by a loop of shared do_filler(), >> length can be determined by the loop iters. >> Non-critical-section is similiar to the critical-section, except it's >> based on non-shared do_filler(). >> >> Currently, adaptive pthread_mutex lock is tested. >> >> v2: Fix benchout json schema validation error. >> --- >> benchtests/Makefile | 2 + >> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++ >> 2 files changed, 290 insertions(+) >> create mode 100644 benchtests/bench-pthread-mutex-locks.c >> >> diff --git a/benchtests/Makefile b/benchtests/Makefile >> index 8dfca592fd..b477042e6c 100644 >> --- a/benchtests/Makefile >> +++ b/benchtests/Makefile >> @@ -102,6 +102,7 @@ endif >> >> bench-pthread := \ >> pthread-locks \ >> + pthread-mutex-locks \ >> pthread_once \ >> thread_create \ >> # bench-pthread >> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests) >> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests) >> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests) >> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests) >> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests) >> >> >> >> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c >> new file mode 100644 >> index 0000000000..e934b0001a >> --- /dev/null >> +++ 
b/benchtests/bench-pthread-mutex-locks.c >> @@ -0,0 +1,288 @@ >> +/* Measure mutex_lock for different threads and critical sections. >> + Copyright (C) 2020-2022 Free Software Foundation, Inc. >> + This file is part of the GNU C Library. >> + >> + The GNU C Library is free software; you can redistribute it and/or >> + modify it under the terms of the GNU Lesser General Public >> + License as published by the Free Software Foundation; either >> + version 2.1 of the License, or (at your option) any later version. >> + >> + The GNU C Library is distributed in the hope that it will be useful, >> + but WITHOUT ANY WARRANTY; without even the implied warranty of >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + Lesser General Public License for more details. >> + >> + You should have received a copy of the GNU Lesser General Public >> + License along with the GNU C Library; if not, see >> + <https://www.gnu.org/licenses/>. */ >> + >> +#define TEST_MAIN >> +#define TEST_NAME "pthread-mutex-locks" >> +#define TIMEOUT (20 * 60) >> + >> +#include <stdio.h> >> +#include <stdlib.h> >> +#include <string.h> >> +#include <unistd.h> >> +#include <math.h> >> +#include <pthread.h> >> +#include <sys/time.h> >> +#include <sys/sysinfo.h> >> +#include "bench-timing.h" >> +#include "json-lib.h" >> + >> +static pthread_mutex_t lock; >> +static pthread_mutexattr_t attr; >> +static pthread_barrier_t barrier; >> + >> +#define START_ITERS 1000 >> + >> +#pragma GCC push_options >> +#pragma GCC optimize(1) >> + >> +static int __attribute__ ((noinline)) fibonacci (int i) >> +{ >> + asm(""); >> + if (i > 2) >> + return fibonacci (i - 1) + fibonacci (i - 2); >> + return 10 + i; >> +} >> + >> +static void >> +do_filler (void) >> +{ >> + char buf1[512], buf2[512]; >> + int f = fibonacci (4); >> + memcpy (buf1, buf2, f); >> +} >> + >> +static void >> +do_filler_shared (void) >> +{ >> + static char buf1[512], buf2[512]; >> + int f = fibonacci (4); >> + memcpy (buf1, buf2, f); 
>> +} >> + >> +#pragma GCC pop_options >> + >> +#define UNIT_WORK_CRT do_filler_shared () >> +#define UNIT_WORK_NON_CRT do_filler () >> + >> +static inline void >> +critical_section (int length) >> +{ >> + for (int i = length; i >= 0; i--) >> + UNIT_WORK_CRT; >> +} >> + >> +static inline void >> +non_critical_section (int length) >> +{ >> + for (int i = length; i >= 0; i--) >> + UNIT_WORK_NON_CRT; >> +} >> + >> +typedef struct Worker_Params >> +{ >> + long iters; >> + int crt_len; >> + int non_crt_len; >> + timing_t duration; >> +} Worker_Params; >> + >> +static void * >> +worker (void *v) >> +{ >> + timing_t start, stop; >> + Worker_Params *p = (Worker_Params *) v; >> + long iters = p->iters; >> + int crt_len = p->crt_len; >> + int non_crt_len = p->non_crt_len; >> + >> + pthread_barrier_wait (&barrier); >> + TIMING_NOW (start); >> + while (iters--) >> + { >> + pthread_mutex_lock (&lock); >> + critical_section (crt_len); >> + pthread_mutex_unlock (&lock); >> + non_critical_section (non_crt_len); >> + } >> + TIMING_NOW (stop); >> + >> + TIMING_DIFF (p->duration, start, stop); >> + return NULL; >> +} >> + >> +static double >> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) >> +{ >> + int i; >> + timing_t mean; >> + Worker_Params *p, params[num_threads]; >> + pthread_t threads[num_threads]; >> + >> + pthread_mutex_init (&lock, &attr); >> + pthread_barrier_init (&barrier, NULL, num_threads); >> + >> + for (i = 0; i < num_threads; i++) >> + { >> + p = &params[i]; >> + p->iters = iters; >> + p->crt_len = crt_len; >> + p->non_crt_len = non_crt_len; >> + pthread_create (&threads[i], NULL, worker, (void *) p); >> + } >> + for (i = 0; i < num_threads; i++) >> + pthread_join (threads[i], NULL); >> + >> + pthread_mutex_destroy (&lock); >> + pthread_barrier_destroy (&barrier); >> + >> + mean = 0; >> + for (i = 0; i < num_threads; i++) >> + mean += params[i].duration; >> + mean /= num_threads; >> + return mean; >> +} >> + >> +#define RUN_COUNT 10 >> +#define 
MIN_TEST_SEC 0.01 >> + >> +static void >> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len, >> + json_ctx_t *js) >> +{ >> + timing_t cur; >> + struct timeval ts, te; >> + double tsd, ted, td; >> + long iters, iters_limit, total_iters; >> + timing_t curs[RUN_COUNT + 2]; >> + int i, j; >> + double mean, stdev; >> + >> + iters = START_ITERS; >> + iters_limit = LONG_MAX / 100; >> + >> + while (1) >> + { >> + gettimeofday (&ts, NULL); >> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters); >> + gettimeofday (&te, NULL); >> + /* Make sure the test to run at least MIN_TEST_SEC. */ >> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; >> + ted = te.tv_sec + te.tv_usec / 1000000.0; >> + td = ted - tsd; >> + if (td >= MIN_TEST_SEC || iters >= iters_limit) >> + break; >> + >> + iters *= 10; >> + } >> + >> + curs[0] = cur; >> + for (i = 1; i < RUN_COUNT + 2; i++) >> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters); >> + >> + /* Sort the results so we can discard the fastest and slowest >> + times as outliers. */ >> + for (i = 0; i < RUN_COUNT + 1; i++) >> + for (j = i + 1; j < RUN_COUNT + 2; j++) >> + if (curs[i] > curs[j]) >> + { >> + timing_t temp = curs[i]; >> + curs[i] = curs[j]; >> + curs[j] = temp; >> + } >> + >> + /* Calculate mean and standard deviation. 
*/ >> + mean = 0.0; >> + total_iters = iters * num_threads; >> + for (i = 1; i < RUN_COUNT + 1; i++) >> + mean += (double) curs[i] / (double) total_iters; >> + mean /= RUN_COUNT; >> + >> + stdev = 0.0; >> + for (i = 1; i < RUN_COUNT + 1; i++) >> + { >> + double s = (double) curs[i] / (double) total_iters - mean; >> + stdev += s * s; >> + } >> + stdev = sqrt (stdev / (RUN_COUNT - 1)); >> + >> + char buf[256]; >> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name, >> + non_crt_len, crt_len, num_threads); >> + >> + json_attr_object_begin (js, buf); >> + >> + json_attr_double (js, "duration", (double) cur); >> + json_attr_double (js, "iterations", (double) total_iters); >> + json_attr_double (js, "mean", mean); >> + json_attr_double (js, "stdev", stdev); >> + json_attr_double (js, "min-outlier", >> + (double) curs[0] / (double) total_iters); >> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters); >> + json_attr_double (js, "max", >> + (double) curs[RUN_COUNT] / (double) total_iters); >> + json_attr_double (js, "max-outlier", >> + (double) curs[RUN_COUNT + 1] / (double) total_iters); >> + >> + json_attr_object_end (js); >> +} >> + >> +#define TH_CONF_MAX 10 >> + >> +int >> +do_bench (void) >> +{ >> + int rv = 0; >> + json_ctx_t json_ctx; >> + int i, j, k; >> + int th_num, th_conf, nprocs; >> + int threads[TH_CONF_MAX]; >> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 }; >> + int non_crt_lens[] = { 1, 32, 128 }; >> + char name[128]; >> + >> + json_init (&json_ctx, 2, stdout); >> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); >> + >> + /* The thread config begins from 1, and increases by 2x until nprocs. >> + We also wants to test over-saturation case (1.25*nprocs). 
*/ >> + nprocs = get_nprocs (); >> + th_num = 1; >> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++) >> + { >> + threads[th_conf] = th_num; >> + th_num <<= 1; >> + } >> + threads[th_conf++] = nprocs; >> + threads[th_conf++] = nprocs + nprocs / 4; >> + >> + pthread_mutexattr_init (&attr); >> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); >> + snprintf (name, sizeof name, "type=adaptive"); >> + >> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) >> + { >> + int non_crt_len = non_crt_lens[k]; >> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++) >> + { >> + int crt_len = crt_lens[j]; >> + for (i = 0; i < th_conf; i++) >> + { >> + th_num = threads[i]; >> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx); >> + } >> + } >> + } >> + >> + json_attr_object_end (&json_ctx); >> + >> + return rv; >> +} >> + >> +#define TEST_FUNCTION do_bench () >> + >> +#include "../test-skeleton.c" >> -- >> 2.35.1 >> > > Can you run clang-format on this? Otherwise > LGTM. > clang-format done. Nothing needs to change for this patch.
On Thu, Apr 21, 2022 at 5:58 PM Guo, Wangyang <wangyang.guo@intel.com> wrote: > > On 4/21/2022 9:13 PM, Noah Goldstein via Libc-alpha wrote: > > On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote: > >> > >> Benchmark for testing pthread mutex locks performance with different > >> threads and critical sections. > >> > >> The test configuration consists of 3 parts: > >> 1. thread number > >> 2. critical-section length > >> 3. non-critical-section length > >> > >> Thread number starts from 1 and increased by 2x until num of CPU cores > >> (nprocs). An additional over-saturation case (1.25 * nprocs) is also > >> included. > >> Critical-section is represented by a loop of shared do_filler(), > >> length can be determined by the loop iters. > >> Non-critical-section is similiar to the critical-section, except it's > >> based on non-shared do_filler(). > >> > >> Currently, adaptive pthread_mutex lock is tested. > >> > >> v2: Fix benchout json schema validation error. > >> --- > >> benchtests/Makefile | 2 + > >> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++ > >> 2 files changed, 290 insertions(+) > >> create mode 100644 benchtests/bench-pthread-mutex-locks.c > >> > >> diff --git a/benchtests/Makefile b/benchtests/Makefile > >> index 8dfca592fd..b477042e6c 100644 > >> --- a/benchtests/Makefile > >> +++ b/benchtests/Makefile > >> @@ -102,6 +102,7 @@ endif > >> > >> bench-pthread := \ > >> pthread-locks \ > >> + pthread-mutex-locks \ > >> pthread_once \ > >> thread_create \ > >> # bench-pthread > >> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests) > >> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests) > >> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests) > >> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests) > >> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests) > >> > >> > >> > >> diff --git 
a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c > >> new file mode 100644 > >> index 0000000000..e934b0001a > >> --- /dev/null > >> +++ b/benchtests/bench-pthread-mutex-locks.c > >> @@ -0,0 +1,288 @@ > >> +/* Measure mutex_lock for different threads and critical sections. > >> + Copyright (C) 2020-2022 Free Software Foundation, Inc. > >> + This file is part of the GNU C Library. > >> + > >> + The GNU C Library is free software; you can redistribute it and/or > >> + modify it under the terms of the GNU Lesser General Public > >> + License as published by the Free Software Foundation; either > >> + version 2.1 of the License, or (at your option) any later version. > >> + > >> + The GNU C Library is distributed in the hope that it will be useful, > >> + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> + Lesser General Public License for more details. > >> + > >> + You should have received a copy of the GNU Lesser General Public > >> + License along with the GNU C Library; if not, see > >> + <https://www.gnu.org/licenses/>. 
*/ > >> + > >> +#define TEST_MAIN > >> +#define TEST_NAME "pthread-mutex-locks" > >> +#define TIMEOUT (20 * 60) > >> + > >> +#include <stdio.h> > >> +#include <stdlib.h> > >> +#include <string.h> > >> +#include <unistd.h> > >> +#include <math.h> > >> +#include <pthread.h> > >> +#include <sys/time.h> > >> +#include <sys/sysinfo.h> > >> +#include "bench-timing.h" > >> +#include "json-lib.h" > >> + > >> +static pthread_mutex_t lock; > >> +static pthread_mutexattr_t attr; > >> +static pthread_barrier_t barrier; > >> + > >> +#define START_ITERS 1000 > >> + > >> +#pragma GCC push_options > >> +#pragma GCC optimize(1) > >> + > >> +static int __attribute__ ((noinline)) fibonacci (int i) > >> +{ > >> + asm(""); > >> + if (i > 2) > >> + return fibonacci (i - 1) + fibonacci (i - 2); > >> + return 10 + i; > >> +} > >> + > >> +static void > >> +do_filler (void) > >> +{ > >> + char buf1[512], buf2[512]; > >> + int f = fibonacci (4); > >> + memcpy (buf1, buf2, f); > >> +} > >> + > >> +static void > >> +do_filler_shared (void) > >> +{ > >> + static char buf1[512], buf2[512]; > >> + int f = fibonacci (4); > >> + memcpy (buf1, buf2, f); > >> +} > >> + > >> +#pragma GCC pop_options > >> + > >> +#define UNIT_WORK_CRT do_filler_shared () > >> +#define UNIT_WORK_NON_CRT do_filler () > >> + > >> +static inline void > >> +critical_section (int length) > >> +{ > >> + for (int i = length; i >= 0; i--) > >> + UNIT_WORK_CRT; > >> +} > >> + > >> +static inline void > >> +non_critical_section (int length) > >> +{ > >> + for (int i = length; i >= 0; i--) > >> + UNIT_WORK_NON_CRT; > >> +} > >> + > >> +typedef struct Worker_Params > >> +{ > >> + long iters; > >> + int crt_len; > >> + int non_crt_len; > >> + timing_t duration; > >> +} Worker_Params; > >> + > >> +static void * > >> +worker (void *v) > >> +{ > >> + timing_t start, stop; > >> + Worker_Params *p = (Worker_Params *) v; > >> + long iters = p->iters; > >> + int crt_len = p->crt_len; > >> + int non_crt_len = p->non_crt_len; > >> + > >> + 
pthread_barrier_wait (&barrier); > >> + TIMING_NOW (start); > >> + while (iters--) > >> + { > >> + pthread_mutex_lock (&lock); > >> + critical_section (crt_len); > >> + pthread_mutex_unlock (&lock); > >> + non_critical_section (non_crt_len); > >> + } > >> + TIMING_NOW (stop); > >> + > >> + TIMING_DIFF (p->duration, start, stop); > >> + return NULL; > >> +} > >> + > >> +static double > >> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) > >> +{ > >> + int i; > >> + timing_t mean; > >> + Worker_Params *p, params[num_threads]; > >> + pthread_t threads[num_threads]; > >> + > >> + pthread_mutex_init (&lock, &attr); > >> + pthread_barrier_init (&barrier, NULL, num_threads); > >> + > >> + for (i = 0; i < num_threads; i++) > >> + { > >> + p = &params[i]; > >> + p->iters = iters; > >> + p->crt_len = crt_len; > >> + p->non_crt_len = non_crt_len; > >> + pthread_create (&threads[i], NULL, worker, (void *) p); > >> + } > >> + for (i = 0; i < num_threads; i++) > >> + pthread_join (threads[i], NULL); > >> + > >> + pthread_mutex_destroy (&lock); > >> + pthread_barrier_destroy (&barrier); > >> + > >> + mean = 0; > >> + for (i = 0; i < num_threads; i++) > >> + mean += params[i].duration; > >> + mean /= num_threads; > >> + return mean; > >> +} > >> + > >> +#define RUN_COUNT 10 > >> +#define MIN_TEST_SEC 0.01 > >> + > >> +static void > >> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len, > >> + json_ctx_t *js) > >> +{ > >> + timing_t cur; > >> + struct timeval ts, te; > >> + double tsd, ted, td; > >> + long iters, iters_limit, total_iters; > >> + timing_t curs[RUN_COUNT + 2]; > >> + int i, j; > >> + double mean, stdev; > >> + > >> + iters = START_ITERS; > >> + iters_limit = LONG_MAX / 100; > >> + > >> + while (1) > >> + { > >> + gettimeofday (&ts, NULL); > >> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters); > >> + gettimeofday (&te, NULL); > >> + /* Make sure the test to run at least MIN_TEST_SEC. 
*/ > >> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; > >> + ted = te.tv_sec + te.tv_usec / 1000000.0; > >> + td = ted - tsd; > >> + if (td >= MIN_TEST_SEC || iters >= iters_limit) > >> + break; > >> + > >> + iters *= 10; > >> + } > >> + > >> + curs[0] = cur; > >> + for (i = 1; i < RUN_COUNT + 2; i++) > >> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters); > >> + > >> + /* Sort the results so we can discard the fastest and slowest > >> + times as outliers. */ > >> + for (i = 0; i < RUN_COUNT + 1; i++) > >> + for (j = i + 1; j < RUN_COUNT + 2; j++) > >> + if (curs[i] > curs[j]) > >> + { > >> + timing_t temp = curs[i]; > >> + curs[i] = curs[j]; > >> + curs[j] = temp; > >> + } > >> + > >> + /* Calculate mean and standard deviation. */ > >> + mean = 0.0; > >> + total_iters = iters * num_threads; > >> + for (i = 1; i < RUN_COUNT + 1; i++) > >> + mean += (double) curs[i] / (double) total_iters; > >> + mean /= RUN_COUNT; > >> + > >> + stdev = 0.0; > >> + for (i = 1; i < RUN_COUNT + 1; i++) > >> + { > >> + double s = (double) curs[i] / (double) total_iters - mean; > >> + stdev += s * s; > >> + } > >> + stdev = sqrt (stdev / (RUN_COUNT - 1)); > >> + > >> + char buf[256]; > >> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name, > >> + non_crt_len, crt_len, num_threads); > >> + > >> + json_attr_object_begin (js, buf); > >> + > >> + json_attr_double (js, "duration", (double) cur); > >> + json_attr_double (js, "iterations", (double) total_iters); > >> + json_attr_double (js, "mean", mean); > >> + json_attr_double (js, "stdev", stdev); > >> + json_attr_double (js, "min-outlier", > >> + (double) curs[0] / (double) total_iters); > >> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters); > >> + json_attr_double (js, "max", > >> + (double) curs[RUN_COUNT] / (double) total_iters); > >> + json_attr_double (js, "max-outlier", > >> + (double) curs[RUN_COUNT + 1] / (double) total_iters); > >> + > >> + json_attr_object_end (js); 
> >> +} > >> + > >> +#define TH_CONF_MAX 10 > >> + > >> +int > >> +do_bench (void) > >> +{ > >> + int rv = 0; > >> + json_ctx_t json_ctx; > >> + int i, j, k; > >> + int th_num, th_conf, nprocs; > >> + int threads[TH_CONF_MAX]; > >> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 }; > >> + int non_crt_lens[] = { 1, 32, 128 }; > >> + char name[128]; > >> + > >> + json_init (&json_ctx, 2, stdout); > >> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); > >> + > >> + /* The thread config begins from 1, and increases by 2x until nprocs. > >> + We also wants to test over-saturation case (1.25*nprocs). */ > >> + nprocs = get_nprocs (); > >> + th_num = 1; > >> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++) > >> + { > >> + threads[th_conf] = th_num; > >> + th_num <<= 1; > >> + } > >> + threads[th_conf++] = nprocs; > >> + threads[th_conf++] = nprocs + nprocs / 4; > >> + > >> + pthread_mutexattr_init (&attr); > >> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); > >> + snprintf (name, sizeof name, "type=adaptive"); > >> + > >> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) > >> + { > >> + int non_crt_len = non_crt_lens[k]; > >> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++) > >> + { > >> + int crt_len = crt_lens[j]; > >> + for (i = 0; i < th_conf; i++) > >> + { > >> + th_num = threads[i]; > >> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx); > >> + } > >> + } > >> + } > >> + > >> + json_attr_object_end (&json_ctx); > >> + > >> + return rv; > >> +} > >> + > >> +#define TEST_FUNCTION do_bench () > >> + > >> +#include "../test-skeleton.c" > >> -- > >> 2.35.1 > >> > > > > Can you run clang-format on this? Otherwise > > LGTM. > > > > clang-format done. > Nothing needs to change for this patch. Woops. LGTM.
On Sat, Apr 23, 2022 at 8:04 PM Noah Goldstein via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Thu, Apr 21, 2022 at 5:58 PM Guo, Wangyang <wangyang.guo@intel.com> wrote: > > > > On 4/21/2022 9:13 PM, Noah Goldstein via Libc-alpha wrote: > > > On Wed, Apr 20, 2022 at 10:29 PM Wangyang Guo <wangyang.guo@intel.com> wrote: > > >> > > >> Benchmark for testing pthread mutex locks performance with different > > >> threads and critical sections. > > >> > > >> The test configuration consists of 3 parts: > > >> 1. thread number > > >> 2. critical-section length > > >> 3. non-critical-section length > > >> > > >> Thread number starts from 1 and increased by 2x until num of CPU cores > > >> (nprocs). An additional over-saturation case (1.25 * nprocs) is also > > >> included. > > >> Critical-section is represented by a loop of shared do_filler(), > > >> length can be determined by the loop iters. > > >> Non-critical-section is similiar to the critical-section, except it's > > >> based on non-shared do_filler(). > > >> > > >> Currently, adaptive pthread_mutex lock is tested. > > >> > > >> v2: Fix benchout json schema validation error. 
> > >> --- > > >> benchtests/Makefile | 2 + > > >> benchtests/bench-pthread-mutex-locks.c | 288 +++++++++++++++++++++++++ > > >> 2 files changed, 290 insertions(+) > > >> create mode 100644 benchtests/bench-pthread-mutex-locks.c > > >> > > >> diff --git a/benchtests/Makefile b/benchtests/Makefile > > >> index 8dfca592fd..b477042e6c 100644 > > >> --- a/benchtests/Makefile > > >> +++ b/benchtests/Makefile > > >> @@ -102,6 +102,7 @@ endif > > >> > > >> bench-pthread := \ > > >> pthread-locks \ > > >> + pthread-mutex-locks \ > > >> pthread_once \ > > >> thread_create \ > > >> # bench-pthread > > >> @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests) > > >> $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests) > > >> $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests) > > >> $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests) > > >> +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests) > > >> > > >> > > >> > > >> diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c > > >> new file mode 100644 > > >> index 0000000000..e934b0001a > > >> --- /dev/null > > >> +++ b/benchtests/bench-pthread-mutex-locks.c > > >> @@ -0,0 +1,288 @@ > > >> +/* Measure mutex_lock for different threads and critical sections. > > >> + Copyright (C) 2020-2022 Free Software Foundation, Inc. > > >> + This file is part of the GNU C Library. > > >> + > > >> + The GNU C Library is free software; you can redistribute it and/or > > >> + modify it under the terms of the GNU Lesser General Public > > >> + License as published by the Free Software Foundation; either > > >> + version 2.1 of the License, or (at your option) any later version. > > >> + > > >> + The GNU C Library is distributed in the hope that it will be useful, > > >> + but WITHOUT ANY WARRANTY; without even the implied warranty of > > >> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > > >> + Lesser General Public License for more details. > > >> + > > >> + You should have received a copy of the GNU Lesser General Public > > >> + License along with the GNU C Library; if not, see > > >> + <https://www.gnu.org/licenses/>. */ > > >> + > > >> +#define TEST_MAIN > > >> +#define TEST_NAME "pthread-mutex-locks" > > >> +#define TIMEOUT (20 * 60) > > >> + > > >> +#include <stdio.h> > > >> +#include <stdlib.h> > > >> +#include <string.h> > > >> +#include <unistd.h> > > >> +#include <math.h> > > >> +#include <pthread.h> > > >> +#include <sys/time.h> > > >> +#include <sys/sysinfo.h> > > >> +#include "bench-timing.h" > > >> +#include "json-lib.h" > > >> + > > >> +static pthread_mutex_t lock; > > >> +static pthread_mutexattr_t attr; > > >> +static pthread_barrier_t barrier; > > >> + > > >> +#define START_ITERS 1000 > > >> + > > >> +#pragma GCC push_options > > >> +#pragma GCC optimize(1) > > >> + > > >> +static int __attribute__ ((noinline)) fibonacci (int i) > > >> +{ > > >> + asm(""); > > >> + if (i > 2) > > >> + return fibonacci (i - 1) + fibonacci (i - 2); > > >> + return 10 + i; > > >> +} > > >> + > > >> +static void > > >> +do_filler (void) > > >> +{ > > >> + char buf1[512], buf2[512]; > > >> + int f = fibonacci (4); > > >> + memcpy (buf1, buf2, f); > > >> +} > > >> + > > >> +static void > > >> +do_filler_shared (void) > > >> +{ > > >> + static char buf1[512], buf2[512]; > > >> + int f = fibonacci (4); > > >> + memcpy (buf1, buf2, f); > > >> +} > > >> + > > >> +#pragma GCC pop_options > > >> + > > >> +#define UNIT_WORK_CRT do_filler_shared () > > >> +#define UNIT_WORK_NON_CRT do_filler () > > >> + > > >> +static inline void > > >> +critical_section (int length) > > >> +{ > > >> + for (int i = length; i >= 0; i--) > > >> + UNIT_WORK_CRT; > > >> +} > > >> + > > >> +static inline void > > >> +non_critical_section (int length) > > >> +{ > > >> + for (int i = length; i >= 0; i--) > > >> + UNIT_WORK_NON_CRT; > > >> +} > > >> + > > >> +typedef 
struct Worker_Params > > >> +{ > > >> + long iters; > > >> + int crt_len; > > >> + int non_crt_len; > > >> + timing_t duration; > > >> +} Worker_Params; > > >> + > > >> +static void * > > >> +worker (void *v) > > >> +{ > > >> + timing_t start, stop; > > >> + Worker_Params *p = (Worker_Params *) v; > > >> + long iters = p->iters; > > >> + int crt_len = p->crt_len; > > >> + int non_crt_len = p->non_crt_len; > > >> + > > >> + pthread_barrier_wait (&barrier); > > >> + TIMING_NOW (start); > > >> + while (iters--) > > >> + { > > >> + pthread_mutex_lock (&lock); > > >> + critical_section (crt_len); > > >> + pthread_mutex_unlock (&lock); > > >> + non_critical_section (non_crt_len); > > >> + } > > >> + TIMING_NOW (stop); > > >> + > > >> + TIMING_DIFF (p->duration, start, stop); > > >> + return NULL; > > >> +} > > >> + > > >> +static double > > >> +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) > > >> +{ > > >> + int i; > > >> + timing_t mean; > > >> + Worker_Params *p, params[num_threads]; > > >> + pthread_t threads[num_threads]; > > >> + > > >> + pthread_mutex_init (&lock, &attr); > > >> + pthread_barrier_init (&barrier, NULL, num_threads); > > >> + > > >> + for (i = 0; i < num_threads; i++) > > >> + { > > >> + p = &params[i]; > > >> + p->iters = iters; > > >> + p->crt_len = crt_len; > > >> + p->non_crt_len = non_crt_len; > > >> + pthread_create (&threads[i], NULL, worker, (void *) p); > > >> + } > > >> + for (i = 0; i < num_threads; i++) > > >> + pthread_join (threads[i], NULL); > > >> + > > >> + pthread_mutex_destroy (&lock); > > >> + pthread_barrier_destroy (&barrier); > > >> + > > >> + mean = 0; > > >> + for (i = 0; i < num_threads; i++) > > >> + mean += params[i].duration; > > >> + mean /= num_threads; > > >> + return mean; > > >> +} > > >> + > > >> +#define RUN_COUNT 10 > > >> +#define MIN_TEST_SEC 0.01 > > >> + > > >> +static void > > >> +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len, > > >> + json_ctx_t *js) > > 
>> +{ > > >> + timing_t cur; > > >> + struct timeval ts, te; > > >> + double tsd, ted, td; > > >> + long iters, iters_limit, total_iters; > > >> + timing_t curs[RUN_COUNT + 2]; > > >> + int i, j; > > >> + double mean, stdev; > > >> + > > >> + iters = START_ITERS; > > >> + iters_limit = LONG_MAX / 100; > > >> + > > >> + while (1) > > >> + { > > >> + gettimeofday (&ts, NULL); > > >> + cur = do_one_test (num_threads, crt_len, non_crt_len, iters); > > >> + gettimeofday (&te, NULL); > > >> + /* Make sure the test runs for at least MIN_TEST_SEC. */ > > >> + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; > > >> + ted = te.tv_sec + te.tv_usec / 1000000.0; > > >> + td = ted - tsd; > > >> + if (td >= MIN_TEST_SEC || iters >= iters_limit) > > >> + break; > > >> + > > >> + iters *= 10; > > >> + } > > >> + > > >> + curs[0] = cur; > > >> + for (i = 1; i < RUN_COUNT + 2; i++) > > >> + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters); > > >> + > > >> + /* Sort the results so we can discard the fastest and slowest > > >> + times as outliers. */ > > >> + for (i = 0; i < RUN_COUNT + 1; i++) > > >> + for (j = i + 1; j < RUN_COUNT + 2; j++) > > >> + if (curs[i] > curs[j]) > > >> + { > > >> + timing_t temp = curs[i]; > > >> + curs[i] = curs[j]; > > >> + curs[j] = temp; > > >> + } > > >> + > > >> + /* Calculate mean and standard deviation. 
*/ > > >> + mean = 0.0; > > >> + total_iters = iters * num_threads; > > >> + for (i = 1; i < RUN_COUNT + 1; i++) > > >> + mean += (double) curs[i] / (double) total_iters; > > >> + mean /= RUN_COUNT; > > >> + > > >> + stdev = 0.0; > > >> + for (i = 1; i < RUN_COUNT + 1; i++) > > >> + { > > >> + double s = (double) curs[i] / (double) total_iters - mean; > > >> + stdev += s * s; > > >> + } > > >> + stdev = sqrt (stdev / (RUN_COUNT - 1)); > > >> + > > >> + char buf[256]; > > >> + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name, > > >> + non_crt_len, crt_len, num_threads); > > >> + > > >> + json_attr_object_begin (js, buf); > > >> + > > >> + json_attr_double (js, "duration", (double) cur); > > >> + json_attr_double (js, "iterations", (double) total_iters); > > >> + json_attr_double (js, "mean", mean); > > >> + json_attr_double (js, "stdev", stdev); > > >> + json_attr_double (js, "min-outlier", > > >> + (double) curs[0] / (double) total_iters); > > >> + json_attr_double (js, "min", (double) curs[1] / (double) total_iters); > > >> + json_attr_double (js, "max", > > >> + (double) curs[RUN_COUNT] / (double) total_iters); > > >> + json_attr_double (js, "max-outlier", > > >> + (double) curs[RUN_COUNT + 1] / (double) total_iters); > > >> + > > >> + json_attr_object_end (js); > > >> +} > > >> + > > >> +#define TH_CONF_MAX 10 > > >> + > > >> +int > > >> +do_bench (void) > > >> +{ > > >> + int rv = 0; > > >> + json_ctx_t json_ctx; > > >> + int i, j, k; > > >> + int th_num, th_conf, nprocs; > > >> + int threads[TH_CONF_MAX]; > > >> + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 }; > > >> + int non_crt_lens[] = { 1, 32, 128 }; > > >> + char name[128]; > > >> + > > >> + json_init (&json_ctx, 2, stdout); > > >> + json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); > > >> + > > >> + /* The thread config begins from 1, and increases by 2x until nprocs. > > >> + We also want to test the over-saturation case (1.25*nprocs). 
*/ > > >> + nprocs = get_nprocs (); > > >> + th_num = 1; > > >> + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++) > > >> + { > > >> + threads[th_conf] = th_num; > > >> + th_num <<= 1; > > >> + } > > >> + threads[th_conf++] = nprocs; > > >> + threads[th_conf++] = nprocs + nprocs / 4; > > >> + > > >> + pthread_mutexattr_init (&attr); > > >> + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); > > >> + snprintf (name, sizeof name, "type=adaptive"); > > >> + > > >> + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) > > >> + { > > >> + int non_crt_len = non_crt_lens[k]; > > >> + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++) > > >> + { > > >> + int crt_len = crt_lens[j]; > > >> + for (i = 0; i < th_conf; i++) > > >> + { > > >> + th_num = threads[i]; > > >> + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx); > > >> + } > > >> + } > > >> + } > > >> + > > >> + json_attr_object_end (&json_ctx); > > >> + > > >> + return rv; > > >> +} > > >> + > > >> +#define TEST_FUNCTION do_bench () > > >> + > > >> +#include "../test-skeleton.c" > > >> -- > > >> 2.35.1 > > >> > > > > > > Can you run clang-format on this? Otherwise > > > LGTM. > > > > > > > clang-format done. > > Nothing needs to change for this patch. > > > Woops. > > LGTM. I am pushing it now. Thanks.
diff --git a/benchtests/Makefile b/benchtests/Makefile index 8dfca592fd..b477042e6c 100644 --- a/benchtests/Makefile +++ b/benchtests/Makefile @@ -102,6 +102,7 @@ endif bench-pthread := \ pthread-locks \ + pthread-mutex-locks \ pthread_once \ thread_create \ # bench-pthread @@ -281,6 +282,7 @@ $(addprefix $(objpfx)bench-,$(math-benchset)): $(libm-benchtests) $(addprefix $(objpfx)bench-,$(bench-pthread)): $(thread-library-benchtests) $(addprefix $(objpfx)bench-,$(bench-malloc)): $(thread-library-benchtests) $(addprefix $(objpfx)bench-,pthread-locks): $(libm-benchtests) +$(addprefix $(objpfx)bench-,pthread-mutex-locks): $(libm-benchtests) diff --git a/benchtests/bench-pthread-mutex-locks.c b/benchtests/bench-pthread-mutex-locks.c new file mode 100644 index 0000000000..e934b0001a --- /dev/null +++ b/benchtests/bench-pthread-mutex-locks.c @@ -0,0 +1,288 @@ +/* Measure mutex_lock for different threads and critical sections. + Copyright (C) 2020-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#define TEST_MAIN +#define TEST_NAME "pthread-mutex-locks" +#define TIMEOUT (20 * 60) + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <math.h> +#include <pthread.h> +#include <sys/time.h> +#include <sys/sysinfo.h> +#include "bench-timing.h" +#include "json-lib.h" + +static pthread_mutex_t lock; +static pthread_mutexattr_t attr; +static pthread_barrier_t barrier; + +#define START_ITERS 1000 + +#pragma GCC push_options +#pragma GCC optimize(1) + +static int __attribute__ ((noinline)) fibonacci (int i) +{ + asm(""); + if (i > 2) + return fibonacci (i - 1) + fibonacci (i - 2); + return 10 + i; +} + +static void +do_filler (void) +{ + char buf1[512], buf2[512]; + int f = fibonacci (4); + memcpy (buf1, buf2, f); +} + +static void +do_filler_shared (void) +{ + static char buf1[512], buf2[512]; + int f = fibonacci (4); + memcpy (buf1, buf2, f); +} + +#pragma GCC pop_options + +#define UNIT_WORK_CRT do_filler_shared () +#define UNIT_WORK_NON_CRT do_filler () + +static inline void +critical_section (int length) +{ + for (int i = length; i >= 0; i--) + UNIT_WORK_CRT; +} + +static inline void +non_critical_section (int length) +{ + for (int i = length; i >= 0; i--) + UNIT_WORK_NON_CRT; +} + +typedef struct Worker_Params +{ + long iters; + int crt_len; + int non_crt_len; + timing_t duration; +} Worker_Params; + +static void * +worker (void *v) +{ + timing_t start, stop; + Worker_Params *p = (Worker_Params *) v; + long iters = p->iters; + int crt_len = p->crt_len; + int non_crt_len = p->non_crt_len; + + pthread_barrier_wait (&barrier); + TIMING_NOW (start); + while (iters--) + { + pthread_mutex_lock (&lock); + critical_section (crt_len); + pthread_mutex_unlock (&lock); + non_critical_section (non_crt_len); + } + TIMING_NOW (stop); + + TIMING_DIFF (p->duration, start, stop); + return NULL; +} + +static double +do_one_test (int num_threads, int crt_len, int non_crt_len, long iters) +{ + int i; + timing_t mean; + Worker_Params 
*p, params[num_threads]; + pthread_t threads[num_threads]; + + pthread_mutex_init (&lock, &attr); + pthread_barrier_init (&barrier, NULL, num_threads); + + for (i = 0; i < num_threads; i++) + { + p = &params[i]; + p->iters = iters; + p->crt_len = crt_len; + p->non_crt_len = non_crt_len; + pthread_create (&threads[i], NULL, worker, (void *) p); + } + for (i = 0; i < num_threads; i++) + pthread_join (threads[i], NULL); + + pthread_mutex_destroy (&lock); + pthread_barrier_destroy (&barrier); + + mean = 0; + for (i = 0; i < num_threads; i++) + mean += params[i].duration; + mean /= num_threads; + return mean; +} + +#define RUN_COUNT 10 +#define MIN_TEST_SEC 0.01 + +static void +do_bench_one (const char *name, int num_threads, int crt_len, int non_crt_len, + json_ctx_t *js) +{ + timing_t cur; + struct timeval ts, te; + double tsd, ted, td; + long iters, iters_limit, total_iters; + timing_t curs[RUN_COUNT + 2]; + int i, j; + double mean, stdev; + + iters = START_ITERS; + iters_limit = LONG_MAX / 100; + + while (1) + { + gettimeofday (&ts, NULL); + cur = do_one_test (num_threads, crt_len, non_crt_len, iters); + gettimeofday (&te, NULL); + /* Make sure the test runs for at least MIN_TEST_SEC. */ + tsd = ts.tv_sec + ts.tv_usec / 1000000.0; + ted = te.tv_sec + te.tv_usec / 1000000.0; + td = ted - tsd; + if (td >= MIN_TEST_SEC || iters >= iters_limit) + break; + + iters *= 10; + } + + curs[0] = cur; + for (i = 1; i < RUN_COUNT + 2; i++) + curs[i] = do_one_test (num_threads, crt_len, non_crt_len, iters); + + /* Sort the results so we can discard the fastest and slowest + times as outliers. */ + for (i = 0; i < RUN_COUNT + 1; i++) + for (j = i + 1; j < RUN_COUNT + 2; j++) + if (curs[i] > curs[j]) + { + timing_t temp = curs[i]; + curs[i] = curs[j]; + curs[j] = temp; + } + + /* Calculate mean and standard deviation. 
*/ + mean = 0.0; + total_iters = iters * num_threads; + for (i = 1; i < RUN_COUNT + 1; i++) + mean += (double) curs[i] / (double) total_iters; + mean /= RUN_COUNT; + + stdev = 0.0; + for (i = 1; i < RUN_COUNT + 1; i++) + { + double s = (double) curs[i] / (double) total_iters - mean; + stdev += s * s; + } + stdev = sqrt (stdev / (RUN_COUNT - 1)); + + char buf[256]; + snprintf (buf, sizeof buf, "%s,non_crt_len=%d,crt_len=%d,threads=%d", name, + non_crt_len, crt_len, num_threads); + + json_attr_object_begin (js, buf); + + json_attr_double (js, "duration", (double) cur); + json_attr_double (js, "iterations", (double) total_iters); + json_attr_double (js, "mean", mean); + json_attr_double (js, "stdev", stdev); + json_attr_double (js, "min-outlier", + (double) curs[0] / (double) total_iters); + json_attr_double (js, "min", (double) curs[1] / (double) total_iters); + json_attr_double (js, "max", + (double) curs[RUN_COUNT] / (double) total_iters); + json_attr_double (js, "max-outlier", + (double) curs[RUN_COUNT + 1] / (double) total_iters); + + json_attr_object_end (js); +} + +#define TH_CONF_MAX 10 + +int +do_bench (void) +{ + int rv = 0; + json_ctx_t json_ctx; + int i, j, k; + int th_num, th_conf, nprocs; + int threads[TH_CONF_MAX]; + int crt_lens[] = { 0, 1, 2, 4, 8, 16, 32, 64, 128 }; + int non_crt_lens[] = { 1, 32, 128 }; + char name[128]; + + json_init (&json_ctx, 2, stdout); + json_attr_object_begin (&json_ctx, "pthread_mutex_locks"); + + /* The thread config begins from 1, and increases by 2x until nprocs. + We also want to test the over-saturation case (1.25*nprocs). 
*/ + nprocs = get_nprocs (); + th_num = 1; + for (th_conf = 0; th_conf < (TH_CONF_MAX - 2) && th_num < nprocs; th_conf++) + { + threads[th_conf] = th_num; + th_num <<= 1; + } + threads[th_conf++] = nprocs; + threads[th_conf++] = nprocs + nprocs / 4; + + pthread_mutexattr_init (&attr); + pthread_mutexattr_settype (&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + snprintf (name, sizeof name, "type=adaptive"); + + for (k = 0; k < (sizeof (non_crt_lens) / sizeof (int)); k++) + { + int non_crt_len = non_crt_lens[k]; + for (j = 0; j < (sizeof (crt_lens) / sizeof (int)); j++) + { + int crt_len = crt_lens[j]; + for (i = 0; i < th_conf; i++) + { + th_num = threads[i]; + do_bench_one (name, th_num, crt_len, non_crt_len, &json_ctx); + } + } + } + + json_attr_object_end (&json_ctx); + + return rv; +} + +#define TEST_FUNCTION do_bench () + +#include "../test-skeleton.c"