[3/7] benchtests: Add arc4random benchtest

Message ID 20220413202401.408267-4-adhemerval.zanella@linaro.org
State New
Series Add arc4random support

Commit Message

Adhemerval Zanella Netto April 13, 2022, 8:23 p.m. UTC
It shows both throughput (total bytes obtained in the test duration)
and latency for both arc4random and arc4random_buf with different
sizes.

Checked on x86_64-linux-gnu, aarch64-linux, and powerpc64le-linux-gnu.
---
 benchtests/Makefile           |   6 +-
 benchtests/bench-arc4random.c | 243 ++++++++++++++++++++++++++++++++++
 2 files changed, 248 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-arc4random.c
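
[Editorial note, not part of the patch: the throughput numbers are bytes of
random data produced per wall-clock second over the DURATION window, while
the latency numbers are average timing units (cycles or nanoseconds,
depending on TIMING_TYPE) per call.  With the new bench-stdlib group hooked
into BENCHSET, something like "make bench BENCHSET=bench-stdlib" should run
only this benchmark.  A small sketch with made-up numbers:]

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* Hypothetical values, only to show the units reported by the bench.  */
    uint64_t n = 250000000;   /* arc4random calls before the timer fired.  */
    double duration = 10.0;   /* DURATION in seconds (build-time constant).  */
    printf ("throughput: %.0f bytes/s\n",
            (double) (n * sizeof (uint32_t)) / duration);   /* 100000000 */

    uint64_t cur = 40960;     /* TIMING_DIFF over the 1024 latency calls.  */
    printf ("latency: %.1f timing units per call\n",
            (double) cur / 1024.0);                         /* 40.0 */
    return 0;
  }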

Comments

Noah Goldstein April 14, 2022, 7:17 p.m. UTC | #1
On Wed, Apr 13, 2022 at 3:26 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> It shows both throughput (total bytes obtained in the test duration)
> and latency for both arc4random and arc4random_buf with different
> sizes.
>
> Checked on x86_64-linux-gnu, aarch64-linux, and powerpc64le-linux-gnu.
> ---
>  benchtests/Makefile           |   6 +-
>  benchtests/bench-arc4random.c | 243 ++++++++++++++++++++++++++++++++++
>  2 files changed, 248 insertions(+), 1 deletion(-)
>  create mode 100644 benchtests/bench-arc4random.c
>
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 8dfca592fd..50b96dd71f 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -111,8 +111,12 @@ bench-string := \
>    ffsll \
>  # bench-string
>
> +bench-stdlib := \
> +  arc4random \
> +# bench-stdlib
> +
>  ifeq (${BENCHSET},)
> -bench := $(bench-math) $(bench-pthread) $(bench-string)
> +bench := $(bench-math) $(bench-pthread) $(bench-string) $(bench-stdlib)
>  else
>  bench := $(foreach B,$(filter bench-%,${BENCHSET}), ${${B}})
>  endif
> diff --git a/benchtests/bench-arc4random.c b/benchtests/bench-arc4random.c
> new file mode 100644
> index 0000000000..9e2ba9ba34
> --- /dev/null
> +++ b/benchtests/bench-arc4random.c
> @@ -0,0 +1,243 @@
> +/* arc4random benchmarks.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "bench-timing.h"
> +#include "json-lib.h"
> +#include <array_length.h>
> +#include <intprops.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <support/support.h>
> +#include <support/xthread.h>
> +
> +static volatile uint32_t r;
> +static volatile sig_atomic_t timer_finished;
> +
> +static void timer_callback (int unused)
> +{
> +  timer_finished = 1;
> +}
> +
> +static const uint32_t sizes[] = { 0, 16, 32, 64, 128 };
> +
> +static double
> +bench_arc4random_throughput (void)
> +{
> +  /* Run for approximately DURATION seconds; it does not matter which thread
> +     receives the signal (so no need to mask it on the main thread).  */
> +  timer_finished = 0;
> +  timer_t timer = support_create_timer (DURATION, 0, false, timer_callback);
> +
> +  uint64_t n = 0;
> +
> +  while (1)
> +    {
> +      r = arc4random ();
> +      n++;
> +
> +      if (timer_finished == 1)
> +       break;
> +    }
> +
> +  support_delete_timer (timer);
> +
> +  return (double) (n * sizeof (r)) / (double) DURATION;
> +}
> +
> +static double
> +bench_arc4random_latency (void)
> +{
> +  timing_t start, stop, cur;
> +  const size_t iters = 1024;
> +
> +  TIMING_NOW (start);
> +  for (size_t i = 0; i < iters; i++)
> +    r = arc4random ();
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  return (double) (cur) / (double) iters;
> +}
> +
> +static double
> +bench_arc4random_buf_throughput (size_t len)
> +{
> +  timer_finished = 0;
> +  timer_t timer = support_create_timer (DURATION, 0, false, timer_callback);
> +
> +  uint8_t buf[len];
> +
> +  uint64_t n = 0;
> +
> +  while (1)
> +    {
> +      arc4random_buf (buf, len);
> +      n++;
> +
> +      if (timer_finished == 1)
> +       break;
> +    }
> +
> +  support_delete_timer (timer);
> +
> +  uint64_t total = (n * len);
> +  return (double) (total) / (double) DURATION;
> +}
> +
> +static double
> +bench_arc4random_buf_latency (size_t len)
> +{
> +  timing_t start, stop, cur;
> +  const size_t iters = 1024;
> +
> +  uint8_t buf[len];
> +
> +  TIMING_NOW (start);
> +  for (size_t i = 0; i < iters; i++)
> +    arc4random_buf (buf, len);
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +
> +  return (double) (cur) / (double) iters;
> +}
> +
> +static void
> +bench_singlethread (json_ctx_t *json_ctx)
> +{
> +  json_element_object_begin (json_ctx);
> +
> +  json_array_begin (json_ctx, "throughput");
> +  for (int i = 0; i < array_length (sizes); i++)
> +    if (sizes[i] == 0)
> +      json_element_double (json_ctx, bench_arc4random_throughput ());
> +    else
> +      json_element_double (json_ctx, bench_arc4random_buf_throughput (sizes[i]));
> +  json_array_end (json_ctx);
> +
> +  json_array_begin (json_ctx, "latency");
> +  for (int i = 0; i < array_length (sizes); i++)
> +    if (sizes[i] == 0)
> +      json_element_double (json_ctx, bench_arc4random_latency ());
> +    else
> +      json_element_double (json_ctx, bench_arc4random_buf_latency (sizes[i]));
> +  json_array_end (json_ctx);
> +
> +  json_element_object_end (json_ctx);
> +}
> +
> +struct thr_arc4random_arg
> +{
> +  double ret;
> +  uint32_t val;
> +};
> +
> +static void *
> +thr_arc4random_throughput (void *closure)
> +{
> +  struct thr_arc4random_arg *arg = closure;
> +  arg->ret = arg->val == 0 ? bench_arc4random_throughput ()
> +                          : bench_arc4random_buf_throughput (arg->val);
> +  return NULL;
> +}
> +
> +static void *
> +thr_arc4random_latency (void *closure)
> +{
> +  struct thr_arc4random_arg *arg = closure;
> +  arg->ret = arg->val == 0 ? bench_arc4random_latency ()
> +                          : bench_arc4random_buf_latency (arg->val);
> +  return NULL;
> +}

I think the expectation is that the chacha calls will be cold, so
maybe it is worth adding a cache flush of sorts between calls. It may
be that some prefetching at the start would help the code in that
case, but it would only be a regression with the hot-in-L1 benchmarks.

Can wait, though; this V1 looks fine.
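
[Editorial sketch, not something in this patch: one way to model the cold
case described above is to dirty a buffer larger than the last-level cache
between timed calls, so the chacha state and code paths are evicted before
each measurement.  The 8 MiB size and per-call placement are assumptions.]

  #include <stddef.h>

  /* Hypothetical helper: touch one byte per cache line of a buffer big
     enough to push the arc4random state and code out of the caches.  */
  static unsigned char evict_buf[8 << 20];

  static void
  evict_caches (void)
  {
    for (size_t i = 0; i < sizeof evict_buf; i += 64)
      evict_buf[i]++;
  }

It could then be called before each timed iteration, timing every
arc4random call individually instead of the whole loop.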
> +
> +static void
> +bench_threaded (json_ctx_t *json_ctx)
> +{
> +  json_element_object_begin (json_ctx);
> +
> +  json_array_begin (json_ctx, "throughput");
> +  for (int i = 0; i < array_length (sizes); i++)
> +    {
> +      struct thr_arc4random_arg arg = { .val = sizes[i] };
> +      pthread_t thr = xpthread_create (NULL, thr_arc4random_throughput, &arg);
> +      xpthread_join (thr);
> +      json_element_double (json_ctx, arg.ret);
> +    }
> +  json_array_end (json_ctx);
> +
> +  json_array_begin (json_ctx, "latency");
> +  for (int i = 0; i < array_length (sizes); i++)
> +    {
> +      struct thr_arc4random_arg arg = { .val = sizes[i] };
> +      pthread_t thr = xpthread_create (NULL, thr_arc4random_latency, &arg);
> +      xpthread_join (thr);
> +      json_element_double (json_ctx, arg.ret);
> +    }
> +  json_array_end (json_ctx);
> +
> +  json_element_object_end (json_ctx);
> +}
> +
> +static void
> +run_bench (json_ctx_t *json_ctx, const char *name,
> +          char *const*fnames, size_t fnameslen,
> +          void (*bench)(json_ctx_t *ctx))
> +{
> +  json_attr_object_begin (json_ctx, name);
> +  json_array_begin (json_ctx, "functions");
> +  for (int i = 0; i < fnameslen; i++)
> +    json_element_string (json_ctx, fnames[i]);
> +  json_array_end (json_ctx);
> +
> +  json_array_begin (json_ctx, "results");
> +  bench (json_ctx);
> +  json_array_end (json_ctx);
> +  json_attr_object_end (json_ctx);
> +}
> +
> +static int
> +do_test (void)
> +{
> +  char *fnames[array_length (sizes) + 1];
> +  fnames[0] = (char *) "arc4random";
> +  for (int i = 0; i < array_length (sizes); i++)
> +    fnames[i+1] = xasprintf ("arc4random_buf(%u)", sizes[i]);
> +
> +  json_ctx_t json_ctx;
> +  json_init (&json_ctx, 0, stdout);
> +
> +  json_document_begin (&json_ctx);
> +  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
> +
> +  run_bench (&json_ctx, "single-thread", fnames, array_length (fnames),
> +            bench_singlethread);
> +  run_bench (&json_ctx, "multi-thread", fnames, array_length (fnames),
> +            bench_threaded);
> +
> +  json_document_end (&json_ctx);
> +
> +  for (int i = 0; i < array_length (sizes); i++)
> +    free (fnames[i+1]);
> +
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> --
> 2.32.0
>
Adhemerval Zanella Netto April 14, 2022, 7:48 p.m. UTC | #2
On 14/04/2022 16:17, Noah Goldstein wrote:
> On Wed, Apr 13, 2022 at 3:26 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> It shows both throughput (total bytes obtained in the test duration)
>> and latency for both arc4random and arc4random_buf with different
>> sizes.
>>
>> +
>> +static void *
>> +thr_arc4random_latency (void *closure)
>> +{
>> +  struct thr_arc4random_arg *arg = closure;
>> +  arg->ret = arg->val == 0 ? bench_arc4random_latency ()
>> +                          : bench_arc4random_buf_latency (arg->val);
>> +  return NULL;
>> +}
> 
> I think the expectation is that the chacha calls will be cold, so
> maybe it is worth adding a cache flush of sorts between calls. It may
> be that some prefetching at the start would help the code in that
> case, but it would only be a regression with the hot-in-L1 benchmarks.
>
> Can wait, though; this V1 looks fine.

In fact I think just checking the call within a thread does not add
much, especially since we don't have any single-thread lock optimization
for internal locks.  I will remove it in v2 and maybe revise it in the
future.
Noah Goldstein April 14, 2022, 8:33 p.m. UTC | #3
On Thu, Apr 14, 2022 at 2:48 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 14/04/2022 16:17, Noah Goldstein wrote:
> > On Wed, Apr 13, 2022 at 3:26 PM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >> It shows both throughput (total bytes obtained in the test duration)
> >> and latency for both arc4random and arc4random_buf with different
> >> sizes.
> >>
> >> +
> >> +static void *
> >> +thr_arc4random_latency (void *closure)
> >> +{
> >> +  struct thr_arc4random_arg *arg = closure;
> >> +  arg->ret = arg->val == 0 ? bench_arc4random_latency ()
> >> +                          : bench_arc4random_buf_latency (arg->val);
> >> +  return NULL;
> >> +}
> >
> > I think the expectation is that the chacha calls will be cold, so
> > maybe it is worth adding a cache flush of sorts between calls. It may
> > be that some prefetching at the start would help the code in that
> > case, but it would only be a regression with the hot-in-L1 benchmarks.
> >
> > Can wait, though; this V1 looks fine.
>
> In fact I think just checking the call within a thread does not add
> much, especially since we don't have any single-thread lock optimization
> for internal locks.  I will remove it in v2 and maybe revise it in the
> future.

What do you mean by single-thread lock optimization?
Adhemerval Zanella Netto April 14, 2022, 8:48 p.m. UTC | #4
On 14/04/2022 17:33, Noah Goldstein wrote:
> On Thu, Apr 14, 2022 at 2:48 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 14/04/2022 16:17, Noah Goldstein wrote:
>>> On Wed, Apr 13, 2022 at 3:26 PM Adhemerval Zanella via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>> It shows both throughput (total bytes obtained in the test duration)
>>>> and latency for both arc4random and arc4random_buf with different
>>>> sizes.
>>>>
>>>> +
>>>> +static void *
>>>> +thr_arc4random_latency (void *closure)
>>>> +{
>>>> +  struct thr_arc4random_arg *arg = closure;
>>>> +  arg->ret = arg->val == 0 ? bench_arc4random_latency ()
>>>> +                          : bench_arc4random_buf_latency (arg->val);
>>>> +  return NULL;
>>>> +}
>>>
>>> I think the expectation is that the chacha calls will be cold, so
>>> maybe it is worth adding a cache flush of sorts between calls. It may
>>> be that some prefetching at the start would help the code in that
>>> case, but it would only be a regression with the hot-in-L1 benchmarks.
>>>
>>> Can wait, though; this V1 looks fine.
>>
>> In fact I think just checking the call within a thread does not add
>> much, especially since we don't have any single-thread lock optimization
>> for internal locks.  I will remove it in v2 and maybe revise it in the
>> future.
> 
> What do you mean by single-thread lock optimization?

Not taking the lock if the process is single-threaded, as we do on some
fast paths in the malloc code.
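
[For context, a rough sketch of the malloc-style fast path being referred
to; illustrative only.  SINGLE_THREAD_P and the __libc_lock_* macros are
glibc internals, and update_state_unlocked is a placeholder for whatever
the lock protects.]

  #include <libc-lock.h>

  __libc_lock_define (static, state_lock);

  static void update_state_unlocked (void);

  static void
  update_state (void)
  {
    if (SINGLE_THREAD_P)
      /* Only one thread exists, so the lock can be skipped entirely.  */
      update_state_unlocked ();
    else
      {
        __libc_lock_lock (state_lock);
        update_state_unlocked ();
        __libc_lock_unlock (state_lock);
      }
  }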

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8dfca592fd..50b96dd71f 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -111,8 +111,12 @@  bench-string := \
   ffsll \
 # bench-string
 
+bench-stdlib := \
+  arc4random \
+# bench-stdlib
+
 ifeq (${BENCHSET},)
-bench := $(bench-math) $(bench-pthread) $(bench-string)
+bench := $(bench-math) $(bench-pthread) $(bench-string) $(bench-stdlib)
 else
 bench := $(foreach B,$(filter bench-%,${BENCHSET}), ${${B}})
 endif
diff --git a/benchtests/bench-arc4random.c b/benchtests/bench-arc4random.c
new file mode 100644
index 0000000000..9e2ba9ba34
--- /dev/null
+++ b/benchtests/bench-arc4random.c
@@ -0,0 +1,243 @@ 
+/* arc4random benchmarks.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "bench-timing.h"
+#include "json-lib.h"
+#include <array_length.h>
+#include <intprops.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <support/support.h>
+#include <support/xthread.h>
+
+static volatile uint32_t r;
+static volatile sig_atomic_t timer_finished;
+
+static void timer_callback (int unused)
+{
+  timer_finished = 1;
+}
+
+static const uint32_t sizes[] = { 0, 16, 32, 64, 128 };
+
+static double
+bench_arc4random_throughput (void)
+{
+  /* Run for approximately DURATION seconds; it does not matter which thread
+     receives the signal (so no need to mask it on the main thread).  */
+  timer_finished = 0;
+  timer_t timer = support_create_timer (DURATION, 0, false, timer_callback);
+
+  uint64_t n = 0;
+
+  while (1)
+    {
+      r = arc4random ();
+      n++;
+
+      if (timer_finished == 1)
+	break;
+    }
+
+  support_delete_timer (timer);
+
+  return (double) (n * sizeof (r)) / (double) DURATION;
+}
+
+static double
+bench_arc4random_latency (void)
+{
+  timing_t start, stop, cur;
+  const size_t iters = 1024;
+
+  TIMING_NOW (start);
+  for (size_t i = 0; i < iters; i++)
+    r = arc4random ();
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  return (double) (cur) / (double) iters;
+}
+
+static double
+bench_arc4random_buf_throughput (size_t len)
+{
+  timer_finished = 0;
+  timer_t timer = support_create_timer (DURATION, 0, false, timer_callback);
+
+  uint8_t buf[len];
+
+  uint64_t n = 0;
+
+  while (1)
+    {
+      arc4random_buf (buf, len);
+      n++;
+
+      if (timer_finished == 1)
+	break;
+    }
+
+  support_delete_timer (timer);
+
+  uint64_t total = (n * len);
+  return (double) (total) / (double) DURATION;
+}
+
+static double
+bench_arc4random_buf_latency (size_t len)
+{
+  timing_t start, stop, cur;
+  const size_t iters = 1024;
+
+  uint8_t buf[len];
+
+  TIMING_NOW (start);
+  for (size_t i = 0; i < iters; i++)
+    arc4random_buf (buf, len);
+  TIMING_NOW (stop);
+
+  TIMING_DIFF (cur, start, stop);
+
+  return (double) (cur) / (double) iters;
+}
+
+static void
+bench_singlethread (json_ctx_t *json_ctx)
+{
+  json_element_object_begin (json_ctx);
+
+  json_array_begin (json_ctx, "throughput");
+  for (int i = 0; i < array_length (sizes); i++)
+    if (sizes[i] == 0)
+      json_element_double (json_ctx, bench_arc4random_throughput ());
+    else
+      json_element_double (json_ctx, bench_arc4random_buf_throughput (sizes[i]));
+  json_array_end (json_ctx);
+
+  json_array_begin (json_ctx, "latency");
+  for (int i = 0; i < array_length (sizes); i++)
+    if (sizes[i] == 0)
+      json_element_double (json_ctx, bench_arc4random_latency ());
+    else
+      json_element_double (json_ctx, bench_arc4random_buf_latency (sizes[i]));
+  json_array_end (json_ctx);
+
+  json_element_object_end (json_ctx);
+}
+
+struct thr_arc4random_arg
+{
+  double ret;
+  uint32_t val;
+};
+
+static void *
+thr_arc4random_throughput (void *closure)
+{
+  struct thr_arc4random_arg *arg = closure;
+  arg->ret = arg->val == 0 ? bench_arc4random_throughput ()
+			   : bench_arc4random_buf_throughput (arg->val);
+  return NULL;
+}
+
+static void *
+thr_arc4random_latency (void *closure)
+{
+  struct thr_arc4random_arg *arg = closure;
+  arg->ret = arg->val == 0 ? bench_arc4random_latency ()
+			   : bench_arc4random_buf_latency (arg->val);
+  return NULL;
+}
+
+static void
+bench_threaded (json_ctx_t *json_ctx)
+{
+  json_element_object_begin (json_ctx);
+
+  json_array_begin (json_ctx, "throughput");
+  for (int i = 0; i < array_length (sizes); i++)
+    {
+      struct thr_arc4random_arg arg = { .val = sizes[i] };
+      pthread_t thr = xpthread_create (NULL, thr_arc4random_throughput, &arg);
+      xpthread_join (thr);
+      json_element_double (json_ctx, arg.ret);
+    }
+  json_array_end (json_ctx);
+
+  json_array_begin (json_ctx, "latency");
+  for (int i = 0; i < array_length (sizes); i++)
+    {
+      struct thr_arc4random_arg arg = { .val = sizes[i] };
+      pthread_t thr = xpthread_create (NULL, thr_arc4random_latency, &arg);
+      xpthread_join (thr);
+      json_element_double (json_ctx, arg.ret);
+    }
+  json_array_end (json_ctx);
+
+  json_element_object_end (json_ctx);
+}
+
+static void
+run_bench (json_ctx_t *json_ctx, const char *name,
+	   char *const*fnames, size_t fnameslen,
+	   void (*bench)(json_ctx_t *ctx))
+{
+  json_attr_object_begin (json_ctx, name);
+  json_array_begin (json_ctx, "functions");
+  for (int i = 0; i < fnameslen; i++)
+    json_element_string (json_ctx, fnames[i]);
+  json_array_end (json_ctx);
+
+  json_array_begin (json_ctx, "results");
+  bench (json_ctx);
+  json_array_end (json_ctx);
+  json_attr_object_end (json_ctx);
+}
+
+static int
+do_test (void)
+{
+  char *fnames[array_length (sizes) + 1];
+  fnames[0] = (char *) "arc4random";
+  for (int i = 0; i < array_length (sizes); i++)
+    fnames[i+1] = xasprintf ("arc4random_buf(%u)", sizes[i]);
+
+  json_ctx_t json_ctx;
+  json_init (&json_ctx, 0, stdout);
+
+  json_document_begin (&json_ctx);
+  json_attr_string (&json_ctx, "timing_type", TIMING_TYPE);
+
+  run_bench (&json_ctx, "single-thread", fnames, array_length (fnames),
+	     bench_singlethread);
+  run_bench (&json_ctx, "multi-thread", fnames, array_length (fnames),
+	     bench_threaded);
+
+  json_document_end (&json_ctx);
+
+  for (int i = 0; i < array_length (sizes); i++)
+    free (fnames[i+1]);
+
+  return 0;
+}
+
+#include <support/test-driver.c>