diff mbox series

[v3,9/9] stdlib: Add TLS optimization to arc4random

Message ID 20220419212812.2688764-10-adhemerval.zanella@linaro.org
State New
Headers show
Series Add arc4random support | expand

Commit Message

Adhemerval Zanella Netto April 19, 2022, 9:28 p.m. UTC
The arc4random state is moved to TCB, so there is no allocation
failure.  It adds about 592 bytes struct pthread.

Now that the state is thread private within a shared struct, the
 MADV_WIPEONFORK usage is removed.  The cipher state reset is done
 solely by the atfork internal handler.

The state is also cleared on thread exit iff it was initialized (so if
arc4random is not called it is not touched).

Although it is lock-free, arc4random is still not async-signal-safe
(the per thread state is not updated atomically).

On x86_64 using AVX2 it shows a slight better performance:

From
--------------------------------------------------
arc4random [single-thread]               809.53
arc4random_buf(16) [single-thread]       1242.56
arc4random_buf(32) [single-thread]       1915.90
arc4random_buf(48) [single-thread]       2230.03
arc4random_buf(64) [single-thread]       2429.68
arc4random_buf(80) [single-thread]       2489.70
arc4random_buf(96) [single-thread]       2598.88
arc4random_buf(112) [single-thread]      2699.93
arc4random_buf(128) [single-thread]      2747.31

To                                       MB/s
--------------------------------------------------
arc4random [single-thread]               941.54
arc4random_buf(16) [single-thread]       1409.39
arc4random_buf(32) [single-thread]       2056.17
arc4random_buf(48) [single-thread]       2367.13
arc4random_buf(64) [single-thread]       2551.44
arc4random_buf(80) [single-thread]       2601.38
arc4random_buf(96) [single-thread]       2710.21
arc4random_buf(112) [single-thread]      2797.86
arc4random_buf(128) [single-thread]      2846.12
--------------------------------------------------

However it shows a large speed up specially on architecture with
most costly atomics.  For instance, on a aarch64 Neoverse N1:

From                                     MB/s
--------------------------------------------------
arc4random [single-thread]               154.98
arc4random_buf(16) [single-thread]       342.63
arc4random_buf(32) [single-thread]       485.91
arc4random_buf(48) [single-thread]       539.95
arc4random_buf(64) [single-thread]       593.38
arc4random_buf(80) [single-thread]       629.45
arc4random_buf(96) [single-thread]       655.78
arc4random_buf(112) [single-thread]      670.54
arc4random_buf(128) [single-thread]      681.65
--------------------------------------------------

To                                       MB/s
--------------------------------------------------
arc4random [single-thread]               335.94
arc4random_buf(16) [single-thread]       498.69
arc4random_buf(32) [single-thread]       612.24
arc4random_buf(48) [single-thread]       655.77
arc4random_buf(64) [single-thread]       691.97
arc4random_buf(80) [single-thread]       701.68
arc4random_buf(96) [single-thread]       710.35
arc4random_buf(112) [single-thread]      714.23
arc4random_buf(128) [single-thread]      722.13
--------------------------------------------------

Checked on x86_64-linux-gnu.
---
 nptl/allocatestack.c                   |   5 +-
 stdlib/arc4random.c                    | 137 +++++++------------------
 stdlib/arc4random.h                    |  45 ++++++++
 stdlib/arc4random_uniform.c            |   8 +-
 stdlib/chacha20.c                      |   3 -
 stdlib/tst-arc4random-chacha20.c       |   2 +-
 sysdeps/generic/tls-internal-struct.h  |   3 +
 sysdeps/unix/sysv/linux/tls-internal.h |  27 ++++-
 8 files changed, 115 insertions(+), 115 deletions(-)
 create mode 100644 stdlib/arc4random.h

Comments

Yann Droneaud April 22, 2022, 4:02 p.m. UTC | #1
Le 19/04/2022 à 23:28, Adhemerval Zanella via Libc-alpha a écrit :
> The arc4random state is moved to TCB, so there is no allocation
> failure.  It adds about 592 bytes struct pthread.

+to struct pthread ?


>
> Now that the state is thread private within a shared struct, the
>   MADV_WIPEONFORK usage is removed.  The cipher state reset is done
>   solely by the atfork internal handler.
>
> The state is also cleared on thread exit iff it was initialized (so if
> arc4random is not called it is not touched).
>
> Although it is lock-free, arc4random is still not async-signal-safe
> (the per thread state is not updated atomically).
>
> On x86_64 using AVX2 it shows a slight better performance:
>
> From
> --------------------------------------------------
> arc4random [single-thread]               809.53
> arc4random_buf(16) [single-thread]       1242.56
> arc4random_buf(32) [single-thread]       1915.90
> arc4random_buf(48) [single-thread]       2230.03
> arc4random_buf(64) [single-thread]       2429.68
> arc4random_buf(80) [single-thread]       2489.70
> arc4random_buf(96) [single-thread]       2598.88
> arc4random_buf(112) [single-thread]      2699.93
> arc4random_buf(128) [single-thread]      2747.31
>
> To                                       MB/s
> --------------------------------------------------
> arc4random [single-thread]               941.54
> arc4random_buf(16) [single-thread]       1409.39
> arc4random_buf(32) [single-thread]       2056.17
> arc4random_buf(48) [single-thread]       2367.13
> arc4random_buf(64) [single-thread]       2551.44
> arc4random_buf(80) [single-thread]       2601.38
> arc4random_buf(96) [single-thread]       2710.21
> arc4random_buf(112) [single-thread]      2797.86
> arc4random_buf(128) [single-thread]      2846.12
> --------------------------------------------------
>
> However it shows a large speed up specially on architecture with
> most costly atomics.  For instance, on a aarch64 Neoverse N1:
>
>  From                                     MB/s
> --------------------------------------------------
> arc4random [single-thread]               154.98
> arc4random_buf(16) [single-thread]       342.63
> arc4random_buf(32) [single-thread]       485.91
> arc4random_buf(48) [single-thread]       539.95
> arc4random_buf(64) [single-thread]       593.38
> arc4random_buf(80) [single-thread]       629.45
> arc4random_buf(96) [single-thread]       655.78
> arc4random_buf(112) [single-thread]      670.54
> arc4random_buf(128) [single-thread]      681.65
> --------------------------------------------------
>
> To                                       MB/s
> --------------------------------------------------
> arc4random [single-thread]               335.94
> arc4random_buf(16) [single-thread]       498.69
> arc4random_buf(32) [single-thread]       612.24
> arc4random_buf(48) [single-thread]       655.77
> arc4random_buf(64) [single-thread]       691.97
> arc4random_buf(80) [single-thread]       701.68
> arc4random_buf(96) [single-thread]       710.35
> arc4random_buf(112) [single-thread]      714.23
> arc4random_buf(128) [single-thread]      722.13
> --------------------------------------------------
>
> Checked on x86_64-linux-gnu.
> ---
>   nptl/allocatestack.c                   |   5 +-
>   stdlib/arc4random.c                    | 137 +++++++------------------
>   stdlib/arc4random.h                    |  45 ++++++++
>   stdlib/arc4random_uniform.c            |   8 +-
>   stdlib/chacha20.c                      |   3 -
>   stdlib/tst-arc4random-chacha20.c       |   2 +-
>   sysdeps/generic/tls-internal-struct.h  |   3 +
>   sysdeps/unix/sysv/linux/tls-internal.h |  27 ++++-
>   8 files changed, 115 insertions(+), 115 deletions(-)
>   create mode 100644 stdlib/arc4random.h
>
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index 01a282f3f6..ada65d40c2 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -32,6 +32,7 @@
>   #include <kernel-features.h>
>   #include <nptl-stack.h>
>   #include <libc-lock.h>
> +#include <tls-internal.h>
>   
>   /* Default alignment of stack.  */
>   #ifndef STACK_ALIGN
> @@ -127,7 +128,7 @@ get_cached_stack (size_t *sizep, void **memp)
>   
>     result->exiting = false;
>     __libc_lock_init (result->exit_lock);
> -  result->tls_state = (struct tls_internal_t) { 0 };
> +  __glibc_tls_internal_init (&result->tls_state);
>   
>     /* Clear the DTV.  */
>     dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
> @@ -559,6 +560,8 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>   #endif
>     pd->robust_head.list = &pd->robust_head;
>   
> +  __glibc_tls_internal_init (&pd->tls_state);
> +
>     /* We place the thread descriptor at the end of the stack.  */
>     *pdp = pd;
>   
> diff --git a/stdlib/arc4random.c b/stdlib/arc4random.c
> index cddb0e405a..6144275c08 100644
> --- a/stdlib/arc4random.c
> +++ b/stdlib/arc4random.c
> @@ -16,14 +16,15 @@
>      License along with the GNU C Library; if not, see
>      <http://www.gnu.org/licenses/>.  */
>   
> +#include <arc4random.h>
>   #include <errno.h>
> -#include <libc-lock.h>
>   #include <not-cancel.h>
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <sys/mman.h>
>   #include <sys/param.h>
>   #include <sys/random.h>
> +#include <tls-internal.h>
>   
>   /* Besides the cipher state 'ctx', it keeps two counters: 'have' is the
>      current valid bytes not yet consumed in 'buf', while 'count' is the maximum
> @@ -37,42 +38,16 @@
>      arc4random calls (since only multiple call it will encrypt the next block).
>    */
>   
> -/* Maximum number bytes until reseed (16 MB).  */
> -#define CHACHE_RESEED_SIZE	(16 * 1024 * 1024)
> -/* Internal buffer size in bytes (1KB).  */
> -#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
> -
>   #include <chacha20.c>
>   
> -static struct arc4random_state
> -{
> -  uint32_t ctx[CHACHA20_STATE_LEN];
> -  size_t have;
> -  size_t count;
> -  uint8_t buf[CHACHA20_BUFSIZE];
> -} *state;
> -
> -/* Indicate that MADV_WIPEONFORK is supported by the kernel and thus
> -   it does not require to clear the internal state.  */
> -static bool __arc4random_wipeonfork = false;
> -
> -__libc_lock_define_initialized (, __arc4random_lock);
> -
> -/* Called from the fork function to reset the state if MADV_WIPEONFORK is
> -   not supported and to reinit the internal lock.  */
> +/* Called from the fork function to reset the state.  */
>   void
>   __arc4random_fork_subprocess (void)
>   {
> -  if (__arc4random_wipeonfork && state != NULL)
> -    memset (state, 0, sizeof (struct arc4random_state));
> -
> -  __libc_lock_init (__arc4random_lock);
> -}
> -
> -static void
> -arc4random_allocate_failure (void)
> -{
> -  __libc_fatal ("Fatal glibc error: Cannot allocate memory for arc4random\n");
> +  struct arc4random_state *state = &__glibc_tls_internal()->rnd_state;
> +  memset (state, 0, sizeof (struct arc4random_state));
> +  /* Force key init.  */
> +  state->count = -1;
>   }
>   
>   static void
> @@ -81,33 +56,10 @@ arc4random_getrandom_failure (void)
>     __libc_fatal ("Fatal glibc error: Cannot get entropy for arc4random\n");
>   }
>   
> -/* Fork detection is done by checking if MADV_WIPEONFORK supported.  If not
> -   the fork callback will reset the state on the fork call.  It does not
> -   handle direct clone calls, nor vfork or _Fork (arc4random is not
> -   async-signal-safe due the internal lock usage).  */
> -static void
> -arc4random_init (uint8_t *buf, size_t len)
> -{
> -  state = __mmap (NULL, sizeof (struct arc4random_state),
> -		  PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> -  if (state == MAP_FAILED)
> -    arc4random_allocate_failure ();
> -
> -#ifdef MADV_WIPEONFORK
> -  int r = __madvise (state, sizeof (struct arc4random_state), MADV_WIPEONFORK);
> -  if (r == 0)
> -    __arc4random_wipeonfork = true;
> -  else if (errno != EINVAL)
> -    arc4random_allocate_failure ();
> -#endif
> -
> -  chacha20_init (state->ctx, buf, buf + CHACHA20_KEY_SIZE);
> -}
> -
>   #define min(x,y) (((x) > (y)) ? (y) : (x))
>   
>   static void
> -arc4random_rekey (uint8_t *rnd, size_t rndlen)
> +arc4random_rekey (struct arc4random_state *state, uint8_t *rnd, size_t rndlen)
>   {
>     memset (state->buf, 0, sizeof state->buf);
>     chacha20_crypt (state->ctx, state->buf, state->buf, sizeof state->buf);
> @@ -152,41 +104,41 @@ arc4random_getentropy (uint8_t *rnd, size_t len)
>     arc4random_getrandom_failure ();
>   }
>   
> -/* Either allocates the state buffer or reinit it by reseeding the cipher
> -   state with kernel entropy.  */
> -static void
> -arc4random_stir (void)
> +/* Reinit the thread context by reseeding the cipher state with kernel
> +   entropy.  */
> +static struct arc4random_state *
> +arc4random_check_stir (size_t len)
>   {
> -  uint8_t rnd[CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE];
> -  arc4random_getentropy (rnd, sizeof rnd);
> +  struct arc4random_state *state = &__glibc_tls_internal()->rnd_state;
>   
> -  if (state == NULL)
> -    arc4random_init (rnd, sizeof rnd);
> -  else
> -    arc4random_rekey (rnd, sizeof rnd);
> +  if (state->count < len || state->count == -1)
> +    {
> +      uint8_t rnd[CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE];
> +      arc4random_getentropy (rnd, sizeof rnd);
>   
> -  explicit_bzero (rnd, sizeof rnd);
> +      if (state->count > CHACHE_RESEED_SIZE)
> +	chacha20_init (state->ctx, rnd, rnd + CHACHA20_KEY_SIZE);

for case state->count == -1, chacha20_init() should be called (first) instead of arc4random_rekey()
as chacha20 context is not setup and the buffer contains no keystream yet

     if (state->count == -1)
         chacha20_init (state->ctx, rnd, rnd + CHACHA20_KEY_SIZE);


> +      else
> +	arc4random_rekey (state, rnd, sizeof rnd);
>   
> -  state->have = 0;
> -  memset (state->buf, 0, sizeof state->buf);
> -  state->count = CHACHE_RESEED_SIZE;
> -}
> +      explicit_bzero (rnd, sizeof rnd);
>   
> -static void
> -arc4random_check_stir (size_t len)
> -{
> -  if (state == NULL || state->count < len)
> -    arc4random_stir ();
> +      state->have = 0;
> +      memset (state->buf, 0, sizeof state->buf);
> +      state->count = CHACHE_RESEED_SIZE;
> +    }
>     if (state->count <= len)
>       state->count = 0;
>     else
>       state->count -= len;
> +
> +  return state;
>   }
>   
>   void
> -__arc4random_buf_internal (void *buffer, size_t len)
> +__arc4random_buf (void *buffer, size_t len)
>   {
> -  arc4random_check_stir (len);
> +  struct arc4random_state *state = arc4random_check_stir (len);
>   
>     while (len > 0)
>       {
> @@ -201,29 +153,20 @@ __arc4random_buf_internal (void *buffer, size_t len)
>   	  state->have -= m;
>   	}
>         if (state->have == 0)
> -	arc4random_rekey (NULL, 0);
> +	arc4random_rekey (state, NULL, 0);
>       }
>   }
> -
> -void
> -__arc4random_buf (void *buffer, size_t len)
> -{
> -  __libc_lock_lock (__arc4random_lock);
> -  __arc4random_buf_internal (buffer, len);
> -  __libc_lock_unlock (__arc4random_lock);
> -}
>   libc_hidden_def (__arc4random_buf)
>   weak_alias (__arc4random_buf, arc4random_buf)
>   
> -
> -static uint32_t
> -__arc4random_internal (void)
> +uint32_t
> +__arc4random (void)
>   {
>     uint32_t r;
>   
> -  arc4random_check_stir (sizeof (uint32_t));
> +  struct arc4random_state *state = arc4random_check_stir (sizeof (uint32_t));
>     if (state->have < sizeof (uint32_t))
> -    arc4random_rekey (NULL, 0);
> +    arc4random_rekey (state, NULL, 0);
>     uint8_t *ks = state->buf + sizeof (state->buf) - state->have;
>     memcpy (&r, ks, sizeof (uint32_t));
>     memset (ks, 0, sizeof (uint32_t));
> @@ -231,15 +174,5 @@ __arc4random_internal (void)
>   
>     return r;
>   }
> -
> -uint32_t
> -__arc4random (void)
> -{
> -  uint32_t r;
> -  __libc_lock_lock (__arc4random_lock);
> -  r = __arc4random_internal ();
> -  __libc_lock_unlock (__arc4random_lock);
> -  return r;
> -}
>   libc_hidden_def (__arc4random)
>   weak_alias (__arc4random, arc4random)
> diff --git a/stdlib/arc4random.h b/stdlib/arc4random.h
> new file mode 100644
> index 0000000000..40672299d0
> --- /dev/null
> +++ b/stdlib/arc4random.h
> @@ -0,0 +1,45 @@
> +/* Arc4random definition used on TLS.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _CHACHA20_H
> +#define _CHACHA20_H
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +/* Internal ChaCha20 state.  */
> +#define CHACHA20_STATE_LEN	16
> +#define CHACHA20_BLOCK_SIZE	64
> +
> +/* Maximum number bytes until reseed (16 MB).  */
> +#define CHACHE_RESEED_SIZE	(16 * 1024 * 1024)
> +
> +/* Internal arc4random buffer, used on each feedback step so offer some
> +   backtracking protection and to allow better used of vectorized
> +   chacha20 implementations.  */
> +#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
> +
> +struct arc4random_state
> +{
> +  uint32_t ctx[CHACHA20_STATE_LEN];
> +  size_t have;
> +  size_t count;
> +  uint8_t buf[CHACHA20_BUFSIZE];
> +};
> +
> +#endif
> diff --git a/stdlib/arc4random_uniform.c b/stdlib/arc4random_uniform.c
> index 96ffe62df1..7d0140c375 100644
> --- a/stdlib/arc4random_uniform.c
> +++ b/stdlib/arc4random_uniform.c
> @@ -46,7 +46,7 @@ random_bytes (uint32_t *result, uint32_t byte_count)
>     unsigned char *ptr = (unsigned char *) result;
>     if (__BYTE_ORDER == __BIG_ENDIAN)
>       ptr += 4 - byte_count;
> -  __arc4random_buf_internal (ptr, byte_count);
> +  __arc4random_buf (ptr, byte_count);
>   }
>   
>   static uint32_t
> @@ -142,11 +142,7 @@ __libc_lock_define (extern , __arc4random_lock attribute_hidden)
>   uint32_t
>   __arc4random_uniform (uint32_t upper_bound)
>   {
> -  uint32_t r;
> -  __libc_lock_lock (__arc4random_lock);
> -  r = compute_uniform (upper_bound);
> -  __libc_lock_unlock (__arc4random_lock);
> -  return r;
> +  return compute_uniform (upper_bound);
>   }
>   libc_hidden_def (__arc4random_uniform)
>   weak_alias (__arc4random_uniform, arc4random_uniform)
> diff --git a/stdlib/chacha20.c b/stdlib/chacha20.c
> index fea4994169..0fb55c0fa3 100644
> --- a/stdlib/chacha20.c
> +++ b/stdlib/chacha20.c
> @@ -26,11 +26,8 @@
>   #define CHACHA20_IV_SIZE	16
>   #define CHACHA20_KEY_SIZE	32
>   
> -#define CHACHA20_BLOCK_SIZE     64
>   #define CHACHA20_BLOCK_WORDS    (CHACHA20_BLOCK_SIZE / sizeof (uint32_t))
>   
> -#define CHACHA20_STATE_LEN	16
> -
>   /* Defining CHACHA20_XOR_FINAL issues the final XOR using the input as defined
>      Sby RFC8439.  Since the input stream will either zero bytes (initial state)
>      or the PRNG output itself this step does not add any extra entropy.   */
> diff --git a/stdlib/tst-arc4random-chacha20.c b/stdlib/tst-arc4random-chacha20.c
> index dd0ef6d8ba..614e6e0736 100644
> --- a/stdlib/tst-arc4random-chacha20.c
> +++ b/stdlib/tst-arc4random-chacha20.c
> @@ -16,11 +16,11 @@
>      License along with the GNU C Library; if not, see
>      <http://www.gnu.org/licenses/>.  */
>   
> +#include <arc4random.h>
>   #include <support/check.h>
>   #include <sys/cdefs.h>
>   
>   /* It does not define CHACHA20_XOR_FINAL to check what glibc actual uses. */
> -#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
>   #include <chacha20.c>
>   
>   static int
> diff --git a/sysdeps/generic/tls-internal-struct.h b/sysdeps/generic/tls-internal-struct.h
> index d76c715a96..5d0e2fba53 100644
> --- a/sysdeps/generic/tls-internal-struct.h
> +++ b/sysdeps/generic/tls-internal-struct.h
> @@ -19,10 +19,13 @@
>   #ifndef _TLS_INTERNAL_STRUCT_H
>   #define _TLS_INTERNAL_STRUCT_H 1
>   
> +#include <stdlib/arc4random.h>
> +
>   struct tls_internal_t
>   {
>     char *strsignal_buf;
>     char *strerror_l_buf;
> +  struct arc4random_state rnd_state;
>   };
>   
>   #endif
> diff --git a/sysdeps/unix/sysv/linux/tls-internal.h b/sysdeps/unix/sysv/linux/tls-internal.h
> index f7a1a62135..16ff836d05 100644
> --- a/sysdeps/unix/sysv/linux/tls-internal.h
> +++ b/sysdeps/unix/sysv/linux/tls-internal.h
> @@ -22,6 +22,19 @@
>   #include <stdlib.h>
>   #include <pthreadP.h>
>   
> +static inline void
> +__glibc_tls_internal_init (struct tls_internal_t *tls_state)
> +{
> +  tls_state->strsignal_buf = NULL;
> +  tls_state->strerror_l_buf = NULL;
> +
> +  /* Force key init on created threads.  There is no need to clear the
> +     initial state since it will be done either by allocation a new
> +     stack (through mmap with MAP_ANONYMOUS) or by the free function
> +     below).  */
> +  tls_state->rnd_state.count = -1;
> +}
> +
>   static inline struct tls_internal_t *
>   __glibc_tls_internal (void)
>   {
> @@ -31,8 +44,18 @@ __glibc_tls_internal (void)
>   static inline void
>   __glibc_tls_internal_free (void)
>   {
> -  free (THREAD_SELF->tls_state.strsignal_buf);
> -  free (THREAD_SELF->tls_state.strerror_l_buf);
> +  struct pthread *self = THREAD_SELF;
> +  free (self->tls_state.strsignal_buf);
> +  free (self->tls_state.strerror_l_buf);
> +  if (self->tls_state.rnd_state.count != -1)
> +    {
> +      /* Clear any lingering random state prior so if the thread stack
> +	 is cached it won't leak any data.  */
> +      memset (&self->tls_state.rnd_state, 0,
> +	      sizeof self->tls_state.rnd_state);
> +      /* Force key init on created threads.  */
> +      self->tls_state.rnd_state.count = -1;

setting to -1 is probably not needed, as it will be set by the init 
function.


> +    }
>   }
>   
>   #endif
Adhemerval Zanella Netto April 25, 2022, 12:36 p.m. UTC | #2
On 22/04/2022 13:02, Yann Droneaud wrote:
> Le 19/04/2022 à 23:28, Adhemerval Zanella via Libc-alpha a écrit :
>> The arc4random state is moved to TCB, so there is no allocation
>> failure.  It adds about 592 bytes struct pthread.
> 
> +to struct pthread ?

Ack.


>> +/* Reinit the thread context by reseeding the cipher state with kernel
>> +   entropy.  */
>> +static struct arc4random_state *
>> +arc4random_check_stir (size_t len)
>>  {
>> -  uint8_t rnd[CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE];
>> -  arc4random_getentropy (rnd, sizeof rnd);
>> +  struct arc4random_state *state = &__glibc_tls_internal()->rnd_state;
>>  
>> -  if (state == NULL)
>> -    arc4random_init (rnd, sizeof rnd);
>> -  else
>> -    arc4random_rekey (rnd, sizeof rnd);
>> +  if (state->count < len || state->count == -1)
>> +    {
>> +      uint8_t rnd[CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE];
>> +      arc4random_getentropy (rnd, sizeof rnd);
>>  
>> -  explicit_bzero (rnd, sizeof rnd);
>> +      if (state->count > CHACHE_RESEED_SIZE)
>> +	chacha20_init (state->ctx, rnd, rnd + CHACHA20_KEY_SIZE);
> 
> for case state->count == -1, chacha20_init() should be called (first) instead of arc4random_rekey()
> as chacha20 context is not setup and the buffer contains no keystream yet 
> 
>     if (state->count == -1)
>         chacha20_init (state->ctx, rnd, rnd + CHACHA20_KEY_SIZE);
> 
> 

Indeed, I forgot to change it. 


>>  static inline struct tls_internal_t *
>>  __glibc_tls_internal (void)
>>  {
>> @@ -31,8 +44,18 @@ __glibc_tls_internal (void)
>>  static inline void
>>  __glibc_tls_internal_free (void)
>>  {
>> -  free (THREAD_SELF->tls_state.strsignal_buf);
>> -  free (THREAD_SELF->tls_state.strerror_l_buf);
>> +  struct pthread *self = THREAD_SELF;
>> +  free (self->tls_state.strsignal_buf);
>> +  free (self->tls_state.strerror_l_buf);
>> +  if (self->tls_state.rnd_state.count != -1)
>> +    {
>> +      /* Clear any lingering random state prior so if the thread stack
>> +	 is cached it won't leak any data.  */
>> +      memset (&self->tls_state.rnd_state, 0,
>> +	      sizeof self->tls_state.rnd_state);
>> +      /* Force key init on created threads.  */
>> +      self->tls_state.rnd_state.count = -1;
> 
> setting to -1 is probably not needed, as it will be set by the init function.

Indeed, I removed it.
diff mbox series

Patch

diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 01a282f3f6..ada65d40c2 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -32,6 +32,7 @@ 
 #include <kernel-features.h>
 #include <nptl-stack.h>
 #include <libc-lock.h>
+#include <tls-internal.h>
 
 /* Default alignment of stack.  */
 #ifndef STACK_ALIGN
@@ -127,7 +128,7 @@  get_cached_stack (size_t *sizep, void **memp)
 
   result->exiting = false;
   __libc_lock_init (result->exit_lock);
-  result->tls_state = (struct tls_internal_t) { 0 };
+  __glibc_tls_internal_init (&result->tls_state);
 
   /* Clear the DTV.  */
   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
@@ -559,6 +560,8 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 #endif
   pd->robust_head.list = &pd->robust_head;
 
+  __glibc_tls_internal_init (&pd->tls_state);
+
   /* We place the thread descriptor at the end of the stack.  */
   *pdp = pd;
 
diff --git a/stdlib/arc4random.c b/stdlib/arc4random.c
index cddb0e405a..6144275c08 100644
--- a/stdlib/arc4random.c
+++ b/stdlib/arc4random.c
@@ -16,14 +16,15 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#include <arc4random.h>
 #include <errno.h>
-#include <libc-lock.h>
 #include <not-cancel.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/mman.h>
 #include <sys/param.h>
 #include <sys/random.h>
+#include <tls-internal.h>
 
 /* Besides the cipher state 'ctx', it keeps two counters: 'have' is the
    current valid bytes not yet consumed in 'buf', while 'count' is the maximum
@@ -37,42 +38,16 @@ 
    arc4random calls (since only multiple call it will encrypt the next block).
  */
 
-/* Maximum number bytes until reseed (16 MB).  */
-#define CHACHE_RESEED_SIZE	(16 * 1024 * 1024)
-/* Internal buffer size in bytes (1KB).  */
-#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
-
 #include <chacha20.c>
 
-static struct arc4random_state
-{
-  uint32_t ctx[CHACHA20_STATE_LEN];
-  size_t have;
-  size_t count;
-  uint8_t buf[CHACHA20_BUFSIZE];
-} *state;
-
-/* Indicate that MADV_WIPEONFORK is supported by the kernel and thus
-   it does not require to clear the internal state.  */
-static bool __arc4random_wipeonfork = false;
-
-__libc_lock_define_initialized (, __arc4random_lock);
-
-/* Called from the fork function to reset the state if MADV_WIPEONFORK is
-   not supported and to reinit the internal lock.  */
+/* Called from the fork function to reset the state.  */
 void
 __arc4random_fork_subprocess (void)
 {
-  if (__arc4random_wipeonfork && state != NULL)
-    memset (state, 0, sizeof (struct arc4random_state));
-
-  __libc_lock_init (__arc4random_lock);
-}
-
-static void
-arc4random_allocate_failure (void)
-{
-  __libc_fatal ("Fatal glibc error: Cannot allocate memory for arc4random\n");
+  struct arc4random_state *state = &__glibc_tls_internal()->rnd_state;
+  memset (state, 0, sizeof (struct arc4random_state));
+  /* Force key init.  */
+  state->count = -1;
 }
 
 static void
@@ -81,33 +56,10 @@  arc4random_getrandom_failure (void)
   __libc_fatal ("Fatal glibc error: Cannot get entropy for arc4random\n");
 }
 
-/* Fork detection is done by checking if MADV_WIPEONFORK supported.  If not
-   the fork callback will reset the state on the fork call.  It does not
-   handle direct clone calls, nor vfork or _Fork (arc4random is not
-   async-signal-safe due the internal lock usage).  */
-static void
-arc4random_init (uint8_t *buf, size_t len)
-{
-  state = __mmap (NULL, sizeof (struct arc4random_state),
-		  PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-  if (state == MAP_FAILED)
-    arc4random_allocate_failure ();
-
-#ifdef MADV_WIPEONFORK
-  int r = __madvise (state, sizeof (struct arc4random_state), MADV_WIPEONFORK);
-  if (r == 0)
-    __arc4random_wipeonfork = true;
-  else if (errno != EINVAL)
-    arc4random_allocate_failure ();
-#endif
-
-  chacha20_init (state->ctx, buf, buf + CHACHA20_KEY_SIZE);
-}
-
 #define min(x,y) (((x) > (y)) ? (y) : (x))
 
 static void
-arc4random_rekey (uint8_t *rnd, size_t rndlen)
+arc4random_rekey (struct arc4random_state *state, uint8_t *rnd, size_t rndlen)
 {
   memset (state->buf, 0, sizeof state->buf);
   chacha20_crypt (state->ctx, state->buf, state->buf, sizeof state->buf);
@@ -152,41 +104,41 @@  arc4random_getentropy (uint8_t *rnd, size_t len)
   arc4random_getrandom_failure ();
 }
 
-/* Either allocates the state buffer or reinit it by reseeding the cipher
-   state with kernel entropy.  */
-static void
-arc4random_stir (void)
+/* Reinit the thread context by reseeding the cipher state with kernel
+   entropy.  */
+static struct arc4random_state *
+arc4random_check_stir (size_t len)
 {
-  uint8_t rnd[CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE];
-  arc4random_getentropy (rnd, sizeof rnd);
+  struct arc4random_state *state = &__glibc_tls_internal()->rnd_state;
 
-  if (state == NULL)
-    arc4random_init (rnd, sizeof rnd);
-  else
-    arc4random_rekey (rnd, sizeof rnd);
+  if (state->count < len || state->count == -1)
+    {
+      uint8_t rnd[CHACHA20_KEY_SIZE + CHACHA20_IV_SIZE];
+      arc4random_getentropy (rnd, sizeof rnd);
 
-  explicit_bzero (rnd, sizeof rnd);
+      if (state->count > CHACHE_RESEED_SIZE)
+	chacha20_init (state->ctx, rnd, rnd + CHACHA20_KEY_SIZE);
+      else
+	arc4random_rekey (state, rnd, sizeof rnd);
 
-  state->have = 0;
-  memset (state->buf, 0, sizeof state->buf);
-  state->count = CHACHE_RESEED_SIZE;
-}
+      explicit_bzero (rnd, sizeof rnd);
 
-static void
-arc4random_check_stir (size_t len)
-{
-  if (state == NULL || state->count < len)
-    arc4random_stir ();
+      state->have = 0;
+      memset (state->buf, 0, sizeof state->buf);
+      state->count = CHACHE_RESEED_SIZE;
+    }
   if (state->count <= len)
     state->count = 0;
   else
     state->count -= len;
+
+  return state;
 }
 
 void
-__arc4random_buf_internal (void *buffer, size_t len)
+__arc4random_buf (void *buffer, size_t len)
 {
-  arc4random_check_stir (len);
+  struct arc4random_state *state = arc4random_check_stir (len);
 
   while (len > 0)
     {
@@ -201,29 +153,20 @@  __arc4random_buf_internal (void *buffer, size_t len)
 	  state->have -= m;
 	}
       if (state->have == 0)
-	arc4random_rekey (NULL, 0);
+	arc4random_rekey (state, NULL, 0);
     }
 }
-
-void
-__arc4random_buf (void *buffer, size_t len)
-{
-  __libc_lock_lock (__arc4random_lock);
-  __arc4random_buf_internal (buffer, len);
-  __libc_lock_unlock (__arc4random_lock);
-}
 libc_hidden_def (__arc4random_buf)
 weak_alias (__arc4random_buf, arc4random_buf)
 
-
-static uint32_t
-__arc4random_internal (void)
+uint32_t
+__arc4random (void)
 {
   uint32_t r;
 
-  arc4random_check_stir (sizeof (uint32_t));
+  struct arc4random_state *state = arc4random_check_stir (sizeof (uint32_t));
   if (state->have < sizeof (uint32_t))
-    arc4random_rekey (NULL, 0);
+    arc4random_rekey (state, NULL, 0);
   uint8_t *ks = state->buf + sizeof (state->buf) - state->have;
   memcpy (&r, ks, sizeof (uint32_t));
   memset (ks, 0, sizeof (uint32_t));
@@ -231,15 +174,5 @@  __arc4random_internal (void)
 
   return r;
 }
-
-uint32_t
-__arc4random (void)
-{
-  uint32_t r;
-  __libc_lock_lock (__arc4random_lock);
-  r = __arc4random_internal ();
-  __libc_lock_unlock (__arc4random_lock);
-  return r;
-}
 libc_hidden_def (__arc4random)
 weak_alias (__arc4random, arc4random)
diff --git a/stdlib/arc4random.h b/stdlib/arc4random.h
new file mode 100644
index 0000000000..40672299d0
--- /dev/null
+++ b/stdlib/arc4random.h
@@ -0,0 +1,45 @@ 
+/* Arc4random definition used on TLS.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _CHACHA20_H
+#define _CHACHA20_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Internal ChaCha20 state.  */
+#define CHACHA20_STATE_LEN	16
+#define CHACHA20_BLOCK_SIZE	64
+
+/* Maximum number bytes until reseed (16 MB).  */
+#define CHACHE_RESEED_SIZE	(16 * 1024 * 1024)
+
+/* Internal arc4random buffer, used on each feedback step so offer some
+   backtracking protection and to allow better used of vectorized
+   chacha20 implementations.  */
+#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
+
+struct arc4random_state
+{
+  uint32_t ctx[CHACHA20_STATE_LEN];
+  size_t have;
+  size_t count;
+  uint8_t buf[CHACHA20_BUFSIZE];
+};
+
+#endif
diff --git a/stdlib/arc4random_uniform.c b/stdlib/arc4random_uniform.c
index 96ffe62df1..7d0140c375 100644
--- a/stdlib/arc4random_uniform.c
+++ b/stdlib/arc4random_uniform.c
@@ -46,7 +46,7 @@  random_bytes (uint32_t *result, uint32_t byte_count)
   unsigned char *ptr = (unsigned char *) result;
   if (__BYTE_ORDER == __BIG_ENDIAN)
     ptr += 4 - byte_count;
-  __arc4random_buf_internal (ptr, byte_count);
+  __arc4random_buf (ptr, byte_count);
 }
 
 static uint32_t
@@ -142,11 +142,7 @@  __libc_lock_define (extern , __arc4random_lock attribute_hidden)
 uint32_t
 __arc4random_uniform (uint32_t upper_bound)
 {
-  uint32_t r;
-  __libc_lock_lock (__arc4random_lock);
-  r = compute_uniform (upper_bound);
-  __libc_lock_unlock (__arc4random_lock);
-  return r;
+  return compute_uniform (upper_bound);
 }
 libc_hidden_def (__arc4random_uniform)
 weak_alias (__arc4random_uniform, arc4random_uniform)
diff --git a/stdlib/chacha20.c b/stdlib/chacha20.c
index fea4994169..0fb55c0fa3 100644
--- a/stdlib/chacha20.c
+++ b/stdlib/chacha20.c
@@ -26,11 +26,8 @@ 
 #define CHACHA20_IV_SIZE	16
 #define CHACHA20_KEY_SIZE	32
 
-#define CHACHA20_BLOCK_SIZE     64
 #define CHACHA20_BLOCK_WORDS    (CHACHA20_BLOCK_SIZE / sizeof (uint32_t))
 
-#define CHACHA20_STATE_LEN	16
-
 /* Defining CHACHA20_XOR_FINAL issues the final XOR using the input as defined
    Sby RFC8439.  Since the input stream will either zero bytes (initial state)
    or the PRNG output itself this step does not add any extra entropy.   */
diff --git a/stdlib/tst-arc4random-chacha20.c b/stdlib/tst-arc4random-chacha20.c
index dd0ef6d8ba..614e6e0736 100644
--- a/stdlib/tst-arc4random-chacha20.c
+++ b/stdlib/tst-arc4random-chacha20.c
@@ -16,11 +16,11 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#include <arc4random.h>
 #include <support/check.h>
 #include <sys/cdefs.h>
 
 /* It does not define CHACHA20_XOR_FINAL to check what glibc actual uses. */
-#define CHACHA20_BUFSIZE        (8 * CHACHA20_BLOCK_SIZE)
 #include <chacha20.c>
 
 static int
diff --git a/sysdeps/generic/tls-internal-struct.h b/sysdeps/generic/tls-internal-struct.h
index d76c715a96..5d0e2fba53 100644
--- a/sysdeps/generic/tls-internal-struct.h
+++ b/sysdeps/generic/tls-internal-struct.h
@@ -19,10 +19,13 @@ 
 #ifndef _TLS_INTERNAL_STRUCT_H
 #define _TLS_INTERNAL_STRUCT_H 1
 
+#include <stdlib/arc4random.h>
+
 struct tls_internal_t
 {
   char *strsignal_buf;
   char *strerror_l_buf;
+  struct arc4random_state rnd_state;
 };
 
 #endif
diff --git a/sysdeps/unix/sysv/linux/tls-internal.h b/sysdeps/unix/sysv/linux/tls-internal.h
index f7a1a62135..16ff836d05 100644
--- a/sysdeps/unix/sysv/linux/tls-internal.h
+++ b/sysdeps/unix/sysv/linux/tls-internal.h
@@ -22,6 +22,19 @@ 
 #include <stdlib.h>
 #include <pthreadP.h>
 
+static inline void
+__glibc_tls_internal_init (struct tls_internal_t *tls_state)
+{
+  tls_state->strsignal_buf = NULL;
+  tls_state->strerror_l_buf = NULL;
+
+  /* Force key init on created threads.  There is no need to clear the
+     initial state since it will be done either by allocation a new
+     stack (through mmap with MAP_ANONYMOUS) or by the free function
+     below).  */
+  tls_state->rnd_state.count = -1;
+}
+
 static inline struct tls_internal_t *
 __glibc_tls_internal (void)
 {
@@ -31,8 +44,18 @@  __glibc_tls_internal (void)
 static inline void
 __glibc_tls_internal_free (void)
 {
-  free (THREAD_SELF->tls_state.strsignal_buf);
-  free (THREAD_SELF->tls_state.strerror_l_buf);
+  struct pthread *self = THREAD_SELF;
+  free (self->tls_state.strsignal_buf);
+  free (self->tls_state.strerror_l_buf);
+  if (self->tls_state.rnd_state.count != -1)
+    {
+      /* Clear any lingering random state prior so if the thread stack
+	 is cached it won't leak any data.  */
+      memset (&self->tls_state.rnd_state, 0,
+	      sizeof self->tls_state.rnd_state);
+      /* Force key init on created threads.  */
+      self->tls_state.rnd_state.count = -1;
+    }
 }
 
 #endif